From 58890ff053964fb0459c168e21c5ab3d3cee838f Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Thu, 14 Sep 2017 16:07:21 -0400
Subject: [PATCH] Use 6x blocks for Power8 AES rather than 4x Perforamnce
 increased for all modes when performing 6x vs 4x. 8x and 12x performed worse.

Here are the numbers:
4x Blocks:

<TR><TH>AES/CTR (128-bit key)<TD>1563<TD>2.1<TD>0.409<TD>1392
<TR><TH>AES/CTR (192-bit key)<TD>1403<TD>2.3<TD>0.450<TD>1529
<TR><TH>AES/CTR (256-bit key)<TD>1280<TD>2.5<TD>0.482<TD>1639
<TR><TH>AES/CBC (128-bit key)<TD>582<TD>5.6<TD>0.359<TD>1222
<TR><TH>AES/CBC (192-bit key)<TD>517<TD>6.3<TD>0.394<TD>1339
<TR><TH>AES/CBC (256-bit key)<TD>474<TD>6.8<TD>0.432<TD>1469
<TR><TH>AES/OFB (128-bit key)<TD>533<TD>6.1<TD>0.402<TD>1368
<TR><TH>AES/CFB (128-bit key)<TD>563<TD>5.8<TD>0.461<TD>1568
<TR><TH>AES/ECB (128-bit key)<TD>1829<TD>1.8<TD>0.240<TD>817

6x Blocks:

<TR><TH>AES/CTR (128-bit key)<TD>1750<TD>1.7<TD>0.406<TD>1300
<TR><TH>AES/CTR (192-bit key)<TD>1638<TD>1.9<TD>0.447<TD>1432
<TR><TH>AES/CTR (256-bit key)<TD>1528<TD>2.0<TD>0.482<TD>1541
<TR><TH>AES/CBC (128-bit key)<TD>582<TD>5.2<TD>0.358<TD>1145
<TR><TH>AES/CBC (192-bit key)<TD>517<TD>5.9<TD>0.394<TD>1260
<TR><TH>AES/CBC (256-bit key)<TD>474<TD>6.4<TD>0.431<TD>1379
<TR><TH>AES/OFB (128-bit key)<TD>533<TD>5.7<TD>0.400<TD>1281
<TR><TH>AES/CFB (128-bit key)<TD>563<TD>5.4<TD>0.461<TD>1476
<TR><TH>AES/ECB (128-bit key)<TD>1950<TD>1.6<TD>0.238<TD>763
---
 rijndael-simd.cpp | 77 +++++++++++++++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 26 deletions(-)
diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index be86039f..282f0ddb 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -1004,8 +1004,9 @@ inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned
 	block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
 }
 
-inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
-            VectorType &block3, const word32 *subkeys, unsigned int rounds)
+inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
+            VectorType &block2, VectorType &block3, VectorType &block4,
+            VectorType &block5, const word32 *subkeys, unsigned int rounds)
 {
 	CRYPTOPP_ASSERT(subkeys);
 	const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1015,6 +1016,8 @@ inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorTy
 	block1 = VectorXor(block1, k);
 	block2 = VectorXor(block2, k);
 	block3 = VectorXor(block3, k);
+	block4 = VectorXor(block4, k);
+	block5 = VectorXor(block5, k);
 
 	for (size_t i=1; i<rounds; ++i)
 	{
@@ -1023,6 +1026,8 @@ inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorTy
 		block1 = VectorEncrypt(block1, k);
 		block2 = VectorEncrypt(block2, k);
 		block3 = VectorEncrypt(block3, k);
+		block4 = VectorEncrypt(block4, k);
+		block5 = VectorEncrypt(block5, k);
 	}
 
 	k = VectorLoadKey(rounds*16, keys);
@@ -1030,6 +1035,8 @@ inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorTy
 	block1 = VectorEncryptLast(block1, k);
 	block2 = VectorEncryptLast(block2, k);
 	block3 = VectorEncryptLast(block3, k);
+	block4 = VectorEncryptLast(block4, k);
+	block5 = VectorEncryptLast(block5, k);
 }
 
 inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
@@ -1050,8 +1057,9 @@ inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned
 	block = VectorDecryptLast(block, VectorLoadKey(0, keys));
 }
 
-inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
-            VectorType &block3, const word32 *subkeys, unsigned int rounds)
+inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
+            VectorType &block2, VectorType &block3, VectorType &block4,
+            VectorType &block5, const word32 *subkeys, unsigned int rounds)
 {
 	CRYPTOPP_ASSERT(subkeys);
 	const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1061,6 +1069,8 @@ inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorTy
 	block1 = VectorXor(block1, k);
 	block2 = VectorXor(block2, k);
 	block3 = VectorXor(block3, k);
+	block4 = VectorXor(block4, k);
+	block5 = VectorXor(block5, k);
 
 	for (size_t i=rounds-1; i>0; --i)
 	{
@@ -1069,6 +1079,8 @@ inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorTy
 		block1 = VectorDecrypt(block1, k);
 		block2 = VectorDecrypt(block2, k);
 		block3 = VectorDecrypt(block3, k);
+		block4 = VectorDecrypt(block4, k);
+		block5 = VectorDecrypt(block5, k);
 	}
 
 	k = VectorLoadKey(0, keys);
@@ -1076,10 +1088,12 @@ inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorTy
 	block1 = VectorDecryptLast(block1, k);
 	block2 = VectorDecryptLast(block2, k);
 	block3 = VectorDecryptLast(block3, k);
+	block4 = VectorDecryptLast(block4, k);
+	block5 = VectorDecryptLast(block5, k);
 }
 
-template <typename F1, typename F4>
-size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
+template <typename F1, typename F6>
+size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F6 func6, const word32 *subKeys, size_t rounds,
             const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
 	CRYPTOPP_ASSERT(subKeys);
@@ -1104,22 +1118,25 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *s
 
 	if (flags & BlockTransformation::BT_AllowParallel)
 	{
-		while (length >= 4*blockSize)
+		while (length >= 6*blockSize)
 		{
-			VectorType block0, block1, block2, block3, temp;
+#if defined(IS_LITTLE_ENDIAN)
+			const VectorType one = (VectorType)((uint64x2_p8){1,0});
+#else
+			const VectorType one = (VectorType)((uint64x2_p8){0,1});
+#endif
+
+			VectorType block0, block1, block2, block3, block4, block5, temp;
 			block0 = VectorLoad(inBlocks);
 
 			if (flags & BlockTransformation::BT_InBlockIsCounter)
 			{
-#if defined(IS_LITTLE_ENDIAN)
-				const VectorType one = (VectorType)((uint64x2_p8){1,0});
-#else
-				const VectorType one = (VectorType)((uint64x2_p8){0,1});
-#endif
 				block1 = VectorAdd(block0, one);
 				block2 = VectorAdd(block1, one);
 				block3 = VectorAdd(block2, one);
-				temp   = VectorAdd(block3, one);
+				block4 = VectorAdd(block3, one);
+				block5 = VectorAdd(block4, one);
+				temp   = VectorAdd(block5, one);
 				VectorStore(temp, const_cast<byte*>(inBlocks));
 			}
 			else
@@ -1128,7 +1145,9 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *s
 				block1 = VectorLoad(1*inc, inBlocks);
 				block2 = VectorLoad(2*inc, inBlocks);
 				block3 = VectorLoad(3*inc, inBlocks);
-				inBlocks += 4*inIncrement;
+				block4 = VectorLoad(4*inc, inBlocks);
+				block5 = VectorLoad(5*inc, inBlocks);
+				inBlocks += 6*inIncrement;
 			}
 
 			if (flags & BlockTransformation::BT_XorInput)
@@ -1138,10 +1157,12 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *s
 				block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
 				block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
 				block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
-				xorBlocks += 4*inc;
+				block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
+				block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
+				xorBlocks += 6*inc;
 			}
 
-			func4(block0, block1, block2, block3, subKeys, rounds);
+			func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
 
 			if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
 			{
@@ -1150,7 +1171,9 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *s
 				block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
 				block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
 				block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
-				xorBlocks += 4*inc;
+				block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
+				block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
+				xorBlocks += 6*inc;
 			}
 
 			const int inc = static_cast<int>(outIncrement);
@@ -1158,9 +1181,11 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *s
 			VectorStore(block1, outBlocks+1*inc);
 			VectorStore(block2, outBlocks+2*inc);
 			VectorStore(block3, outBlocks+3*inc);
+			VectorStore(block4, outBlocks+4*inc);
+			VectorStore(block5, outBlocks+5*inc);
 
-			outBlocks += 4*inc;
-			length -= 4*blockSize;
+			outBlocks += 6*inc;
+			length -= 6*blockSize;
 		}
 	}
 
@@ -1191,17 +1216,17 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *s
 }
 
 size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
-            const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+			const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-	return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks,
-            subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+	return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_6_Blocks,
+		subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 
 size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
-            const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+			const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-	return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks,
-            subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+	return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_6_Blocks,
+		subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 
 #endif  // CRYPTOPP_POWER8_AES_AVAILABLE