From 6899d3f8bb06499557c7d0672e371c6ee06a87e9 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Tue, 12 Sep 2017 18:15:55 -0400
Subject: [PATCH] Add AdvancedProcessBlocks for Power8

This increases performance to about 1.6 cpb. We are about 0.5 cpb behind
Botan, and about 1.0 cpb behind OpenSSL. However, it beats the snot out of
the straight C/C++ implementation, which runs at 20 to 30 cpb.
---
 rijndael-simd.cpp | 291 ++++++++++++++++++++++++++++++++++++++++------
 rijndael.cpp      |  23 ++--
 rijndael.h        |   3 +-
 3 files changed, 273 insertions(+), 44 deletions(-)

diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index 0c624fa8..ff46e564 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -817,6 +817,22 @@ uint8x16_p8 Load8x16(const uint8_t src[16])
 #endif
 }
 
+uint8x16_p8 Load8x16(int off, const uint8_t src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    /* http://stackoverflow.com/q/46124383/608639 */
+    uint8_t* s = (uint8_t*)src;
+# if defined(IS_LITTLE_ENDIAN)
+    return vec_xl_be(off, s);
+# else
+    return vec_xl(off, s);
+# endif
+#else
+    /* GCC, Clang, etc */
+    return (uint8x16_p8)vec_vsx_ld(off, src);
+#endif
+}
+
 void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
@@ -854,6 +870,28 @@ uint64x2_p8 Load64x2(const uint8_t src[16])
 #endif
 }
 
+uint64x2_p8 Load64x2(int off, const uint8_t src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    /* http://stackoverflow.com/q/46124383/608639 */
+    uint8_t* s = (uint8_t*)src;
+# if defined(IS_LITTLE_ENDIAN)
+    return (uint64x2_p8)vec_xl_be(off, s);
+# else
+    return (uint64x2_p8)vec_xl(off, s);
+# endif
+#else
+    /* GCC, Clang, etc */
+# if defined(IS_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask);
+# else
+    return (uint64x2_p8)vec_vsx_ld(off, src);
+# endif
+#endif
+}
+
 void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
@@ -894,6 +932,15 @@ inline VectorType VectorLoad(const byte src[16])
 #endif
 }
 
+inline VectorType VectorLoad(int off, const byte src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return Load8x16(off, src);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return Load64x2(off, src);
+#endif
+}
+
 inline VectorType VectorLoadAligned(const byte vec[16])
 {
     return (VectorType)vec_ld(0, vec);
@@ -973,64 +1020,238 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
 #endif
 }
 
-//////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
 
-void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock)
+inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-    VectorType s = VectorLoad(inBlock);
     VectorType k = VectorLoadAligned(keys);
+    block = VectorXor(block, k);
 
-    s = VectorXor(s, k);
     for (size_t i=1; i<rounds-1; i+=2)
     {
-        s = VectorEncrypt(s, VectorLoadAligned(  i*16,   keys));
-        s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
+        block = VectorEncrypt(block, VectorLoadAligned(  i*16,   keys));
+        block = VectorEncrypt(block, VectorLoadAligned((i+1)*16, keys));
     }
 
-    s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
-    s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
-
-    VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
-    s = VectorXor(s, x);
-
-    VectorStore(s, outBlock);
+    block = VectorEncrypt(block, VectorLoadAligned((rounds-1)*16, keys));
+    block = VectorEncryptLast(block, VectorLoadAligned(rounds*16, keys));
 }
 
+inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
+            VectorType &block3, const word32 *subkeys, unsigned int rounds)
+{
+    CRYPTOPP_ASSERT(subkeys);
+    const byte *keys = reinterpret_cast<const byte*>(subkeys);
+
+    VectorType k = VectorLoadAligned(keys);
+    block0 = VectorXor(block0, k);
+    block1 = VectorXor(block1, k);
+    block2 = VectorXor(block2, k);
+    block3 = VectorXor(block3, k);
+
+    for (size_t i=1; i<rounds; ++i)
+    {
+        k = VectorLoadAligned(i*16, keys);
+        block0 = VectorEncrypt(block0, k);
+        block1 = VectorEncrypt(block1, k);
+        block2 = VectorEncrypt(block2, k);
+        block3 = VectorEncrypt(block3, k);
+    }
+
+    k = VectorLoadAligned(rounds*16, keys);
+    block0 = VectorEncryptLast(block0, k);
+    block1 = VectorEncryptLast(block1, k);
+    block2 = VectorEncryptLast(block2, k);
+    block3 = VectorEncryptLast(block3, k);
+}
+
-void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock)
+inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-    VectorType s = VectorLoad(inBlock);
     VectorType k = VectorLoadAligned(rounds*16, keys);
+    block = VectorXor(block, k);
 
-    s = VectorXor(s, k);
     for (size_t i=rounds-1; i>1; i-=2)
     {
-        s = VectorDecrypt(s, VectorLoadAligned(  i*16,   keys));
-        s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
+        block = VectorDecrypt(block, VectorLoadAligned(  i*16,   keys));
+        block = VectorDecrypt(block, VectorLoadAligned((i-1)*16, keys));
     }
 
-    s = VectorDecrypt(s, VectorLoadAligned(16, keys));
-    s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
-
-    // According to benchmarks this is a tad bit slower
-    // if (xorBlock)
-    //     s = VectorXor(s, VectorLoad(xorBlock));
-
-    VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
-    s = VectorXor(s, x);
-
-    VectorStore(s, outBlock);
+    block = VectorDecrypt(block, VectorLoadAligned(16, keys));
+    block = VectorDecryptLast(block, VectorLoadAligned(0, keys));
 }
+
+inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
+            VectorType &block3, const word32 *subkeys, unsigned int rounds)
+{
+    CRYPTOPP_ASSERT(subkeys);
+    const byte *keys = reinterpret_cast<const byte*>(subkeys);
+
+    VectorType k = VectorLoadAligned(rounds*16, keys);
+    block0 = VectorXor(block0, k);
+    block1 = VectorXor(block1, k);
+    block2 = VectorXor(block2, k);
+    block3 = VectorXor(block3, k);
+
+    for (size_t i=rounds-1; i>0; --i)
+    {
+        k = VectorLoadAligned(i*16, keys);
+        block0 = VectorDecrypt(block0, k);
+        block1 = VectorDecrypt(block1, k);
+        block2 = VectorDecrypt(block2, k);
+        block3 = VectorDecrypt(block3, k);
+    }
+
+    k = VectorLoadAligned(0, keys);
+    block0 = VectorDecryptLast(block0, k);
+    block1 = VectorDecryptLast(block1, k);
+    block2 = VectorDecryptLast(block2, k);
+    block3 = VectorDecryptLast(block3, k);
+}
+
+template <typename F1, typename F4>
+size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    CRYPTOPP_ASSERT(subKeys);
+    CRYPTOPP_ASSERT(inBlocks);
+    CRYPTOPP_ASSERT(outBlocks);
+    CRYPTOPP_ASSERT(length >= 16);
+
+    const size_t blockSize = 16;
+    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+    size_t xorIncrement = xorBlocks ? blockSize : 0;
+    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+    if (flags & BlockTransformation::BT_ReverseDirection)
+    {
+        inBlocks += length - blockSize;
+        xorBlocks += length - blockSize;
+        outBlocks += length - blockSize;
+        inIncrement = 0-inIncrement;
+        xorIncrement = 0-xorIncrement;
+        outIncrement = 0-outIncrement;
+    }
+
+    if (flags & BlockTransformation::BT_AllowParallel)
+    {
+        while (length >= 4*blockSize)
+        {
+            VectorType block0, block1, block2, block3, temp;
+            block0 = VectorLoad(inBlocks);
+
+            if (flags & BlockTransformation::BT_InBlockIsCounter)
+            {
+#if defined(IS_LITTLE_ENDIAN)
+                const VectorType one = {1};
+#else
+                const VectorType one = (VectorType)(uint64x2_p8){0,1};
+#endif
+                block1 = VectorAdd(block0, one);
+                block2 = VectorAdd(block1, one);
+                block3 = VectorAdd(block2, one);
+                temp = VectorAdd(block3, one);
+                VectorStore(temp, const_cast<byte*>(inBlocks));
+            }
+            else
+            {
+                //inBlocks += inIncrement;
+                block1 = VectorLoad(1*inIncrement, inBlocks);
+                //inBlocks += inIncrement;
+                block2 = VectorLoad(2*inIncrement, inBlocks);
+                //inBlocks += inIncrement;
+                block3 = VectorLoad(3*inIncrement, inBlocks);
+                //inBlocks += inIncrement;
+                inBlocks += 4*inIncrement;
+            }
+
+            if (flags & BlockTransformation::BT_XorInput)
+            {
+                block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                xorBlocks += 4*xorIncrement;
+            }
+
+            func4(block0, block1, block2, block3, subKeys, rounds);
+
+            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            {
+                block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
+                //xorBlocks += xorIncrement;
+                xorBlocks += 4*xorIncrement;
+            }
+
+            // I can't get Store to run faster using indexed offsets
+            VectorStore(block0, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block1, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block2, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block3, outBlocks);
+            outBlocks += outIncrement;
+
+            length -= 4*blockSize;
+        }
+    }
+
+    while (length >= blockSize)
+    {
+        VectorType block = VectorLoad(inBlocks);
+
+        if (flags & BlockTransformation::BT_XorInput)
+            block = VectorXor(block, VectorLoad(xorBlocks));
+
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+            const_cast<byte*>(inBlocks)[15]++;
+
+        func1(block, subKeys, rounds);
+
+        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            block = VectorXor(block, VectorLoad(xorBlocks));
+
+        VectorStore(block, outBlocks);
+
+        inBlocks += inIncrement;
+        outBlocks += outIncrement;
+        xorBlocks += xorIncrement;
+        length -= blockSize;
+    }
+
+    return length;
+}
+
+size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks,
+            subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks,
+            subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
 #endif  // CRYPTOPP_POWER8_AES_AVAILABLE
 
 NAMESPACE_END
diff --git a/rijndael.cpp b/rijndael.cpp
index 3e016d6f..43c7fa80 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -253,10 +253,10 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
 
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
 extern void ByteReverseArrayLE(byte src[16]);
-extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock);
-extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock);
+extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+extern size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif
 
 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
@@ -408,7 +408,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
     if (HasAES())
     {
-        (void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
+        (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
         return;
     }
 #endif
@@ -502,7 +502,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
     if (HasAES())
     {
-        (void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
+        (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
         return;
     }
 #endif
@@ -1130,7 +1130,7 @@ Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
 #endif  // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
 
-#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
+#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
 // Do nothing
 Rijndael::Enc::Enc() { }
 #endif
@@ -1146,6 +1146,10 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
     if (HasAES())
         return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 #endif
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+    if (HasAES())
+        return Rijndael_Enc_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
 
 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
     if (HasSSE2())
@@ -1205,11 +1209,14 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
     if (HasAESNI())
         return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 #endif
-
 #if CRYPTOPP_ARM_AES_AVAILABLE
     if (HasAES())
         return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 #endif
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+    if (HasAES())
+        return Rijndael_Dec_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
 
     return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
 }
diff --git a/rijndael.h b/rijndael.h
index 483eab5a..b6cd0faf 100644
--- a/rijndael.h
+++ b/rijndael.h
@@ -12,7 +12,8 @@
 #include "seckey.h"
 #include "secblock.h"
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || \
+    CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
 # define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1
 #endif
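
Testing note (not part of the patch): the sketch below is one way to exercise the
new code paths through the public API. It assumes a POWER8 machine where HasAES()
returns true; the file name test-power8-aes.cpp and the 64-byte message size are
arbitrary choices for illustration. CTR mode hands BT_InBlockIsCounter and
BT_AllowParallel down to AdvancedProcessBlocks, so a message of at least four
blocks should reach POWER8_Enc_4_Blocks and the vector counter-increment branch.

    // test-power8-aes.cpp -- hypothetical test driver, not part of the patch
    #include "cryptlib.h"
    #include "aes.h"
    #include "modes.h"
    #include "filters.h"
    #include "osrng.h"
    #include "secblock.h"

    #include <iostream>
    #include <string>

    int main()
    {
        using namespace CryptoPP;

        AutoSeededRandomPool prng;
        SecByteBlock key(AES::DEFAULT_KEYLENGTH), iv(AES::BLOCKSIZE);
        prng.GenerateBlock(key, key.size());
        prng.GenerateBlock(iv, iv.size());

        // 64 bytes = 4 blocks, enough to take the 4-block parallel path
        std::string plain(64, 'A'), cipher, recovered;

        CTR_Mode<AES>::Encryption enc;
        enc.SetKeyWithIV(key, key.size(), iv);
        StringSource(plain, true,
            new StreamTransformationFilter(enc, new StringSink(cipher)));

        CTR_Mode<AES>::Decryption dec;
        dec.SetKeyWithIV(key, key.size(), iv);
        StringSource(cipher, true,
            new StreamTransformationFilter(dec, new StringSink(recovered)));

        std::cout << (plain == recovered ? "round trip OK" : "round trip FAILED") << std::endl;
        return 0;
    }

Running the round trip under cryptest.exe's benchmark options (or simply timing a
large CTR encryption) should show the ~1.6 cpb figure quoted above, versus 20 to
30 cpb when CRYPTOPP_POWER8_AES_AVAILABLE is not defined.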