Add AdvancedProcessBlocks for Power8

This increases performance to about 1.6 cpb. We are about 0.5 cpb behind Botan and about 1.0 cpb behind OpenSSL. However, it beats the snot out of the C/C++ implementation, which runs at 20 to 30 cpb.
pull/507/head
Jeffrey Walton 2017-09-12 18:15:55 -04:00
parent 2ebd30d43c
commit 6899d3f8bb
3 changed files with 273 additions and 44 deletions
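
The cycles-per-byte figures above come from benchmarking. As a rough illustration of how such a number is derived from a timed loop, here is a minimal, self-contained sketch; the buffer size, iteration count, assumed clock rate, and the trivial stand-in transform are illustrative assumptions, not the library's benchmark harness. A real measurement would call the keyed AES routine instead of the stand-in.

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Trivial stand-in for the routine under test; swap in the cipher call
// (e.g. AdvancedProcessBlocks on a keyed encryptor) for a real measurement.
static void TransformBuffer(uint8_t* data, size_t size)
{
    for (size_t i = 0; i < size; ++i)
        data[i] ^= 0x5a;
}

int main()
{
    const size_t size = 1024 * 1024;   // 1 MiB working set (assumption)
    const size_t iterations = 256;     // repetitions to smooth out timer noise
    const double cpuHz = 3.5e9;        // assumed clock rate of the test machine

    std::vector<uint8_t> buf(size, 0);

    const auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iterations; ++i)
        TransformBuffer(buf.data(), buf.size());
    const auto stop = std::chrono::steady_clock::now();

    const double seconds = std::chrono::duration<double>(stop - start).count();
    const double bytes = static_cast<double>(size) * iterations;
    const double cpb = seconds * cpuHz / bytes;   // cycles spent per byte
    std::printf("%.1f MB/s, %.2f cpb\n", bytes / seconds / 1.0e6, cpb);
    return 0;
}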


@@ -817,6 +817,22 @@ uint8x16_p8 Load8x16(const uint8_t src[16])
#endif
}
uint8x16_p8 Load8x16(int off, const uint8_t src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
uint8_t* s = (uint8_t*)src;
# if defined(IS_LITTLE_ENDIAN)
return vec_xl_be(off, s);
# else
return vec_xl(off, s);
# endif
#else
/* GCC, Clang, etc */
return (uint8x16_p8)vec_vsx_ld(off, src);
#endif
}
void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -854,6 +870,28 @@ uint64x2_p8 Load64x2(const uint8_t src[16])
#endif
}
uint64x2_p8 Load64x2(int off, const uint8_t src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
uint8_t* s = (uint8_t*)src;
# if defined(IS_LITTLE_ENDIAN)
return (uint64x2_p8)vec_xl_be(off, s);
# else
return (uint64x2_p8)vec_xl(off, s);
# endif
#else
/* GCC, Clang, etc */
# if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask);
# else
return (uint64x2_p8)vec_vsx_ld(off, src);
# endif
#endif
}
void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -894,6 +932,15 @@ inline VectorType VectorLoad(const byte src[16])
#endif
}
inline VectorType VectorLoad(int off, const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(off, src);
#elif defined(CRYPTOPP_GCC_VERSION)
return Load64x2(off, src);
#endif
}
inline VectorType VectorLoadAligned(const byte vec[16])
{
return (VectorType)vec_ld(0, vec);
@@ -973,64 +1020,238 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
#endif
}
//////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock)
inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(keys);
block = VectorXor(block, k);
s = VectorXor(s, k);
for (size_t i=1; i<rounds-1; i+=2)
{
s = VectorEncrypt(s, VectorLoadAligned( i*16, keys));
s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
block = VectorEncrypt(block, VectorLoadAligned( i*16, keys));
block = VectorEncrypt(block, VectorLoadAligned((i+1)*16, keys));
}
s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
// According to benchmarks this is a tad bit slower
// if (xorBlock)
// s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
VectorStore(s, outBlock);
block = VectorEncrypt(block, VectorLoadAligned((rounds-1)*16, keys));
block = VectorEncryptLast(block, VectorLoadAligned(rounds*16, keys));
}
void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock)
inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
VectorType &block3, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadAligned(keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
block3 = VectorXor(block3, k);
for (size_t i=1; i<rounds; ++i)
{
k = VectorLoadAligned(i*16, keys);
block0 = VectorEncrypt(block0, k);
block1 = VectorEncrypt(block1, k);
block2 = VectorEncrypt(block2, k);
block3 = VectorEncrypt(block3, k);
}
k = VectorLoadAligned(rounds*16, keys);
block0 = VectorEncryptLast(block0, k);
block1 = VectorEncryptLast(block1, k);
block2 = VectorEncryptLast(block2, k);
block3 = VectorEncryptLast(block3, k);
}
inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(rounds*16, keys);
block = VectorXor(block, k);
s = VectorXor(s, k);
for (size_t i=rounds-1; i>1; i-=2)
{
s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
block = VectorDecrypt(block, VectorLoadAligned( i*16, keys));
block = VectorDecrypt(block, VectorLoadAligned((i-1)*16, keys));
}
s = VectorDecrypt(s, VectorLoadAligned(16, keys));
s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
// According to benchmarks this is a tad bit slower
// if (xorBlock)
// s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
VectorStore(s, outBlock);
block = VectorDecrypt(block, VectorLoadAligned(16, keys));
block = VectorDecryptLast(block, VectorLoadAligned(0, keys));
}
inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
VectorType &block3, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadAligned(rounds*16, keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
block3 = VectorXor(block3, k);
for (size_t i=rounds-1; i>0; --i)
{
k = VectorLoadAligned(i*16, keys);
block0 = VectorDecrypt(block0, k);
block1 = VectorDecrypt(block1, k);
block2 = VectorDecrypt(block2, k);
block3 = VectorDecrypt(block3, k);
}
k = VectorLoadAligned(0, keys);
block0 = VectorDecryptLast(block0, k);
block1 = VectorDecryptLast(block1, k);
block2 = VectorDecryptLast(block2, k);
block3 = VectorDecryptLast(block3, k);
}
template <typename F1, typename F4>
size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
CRYPTOPP_ASSERT(inBlocks);
CRYPTOPP_ASSERT(outBlocks);
CRYPTOPP_ASSERT(length >= 16);
const size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
if (flags & BlockTransformation::BT_ReverseDirection)
{
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
VectorType block0, block1, block2, block3, temp;
block0 = VectorLoad(inBlocks);
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
#if defined(IS_LITTLE_ENDIAN)
const VectorType one = {1};
#else
const VectorType one = (VectorType)(uint64x2_p8){0,1};
#endif
block1 = VectorAdd(block0, one);
block2 = VectorAdd(block1, one);
block3 = VectorAdd(block2, one);
temp = VectorAdd(block3, one);
VectorStore(temp, const_cast<byte*>(inBlocks));
}
else
{
//inBlocks += inIncrement;
block1 = VectorLoad(1*inIncrement, inBlocks);
//inBlocks += inIncrement;
block2 = VectorLoad(2*inIncrement, inBlocks);
//inBlocks += inIncrement;
block3 = VectorLoad(3*inIncrement, inBlocks);
//inBlocks += inIncrement;
inBlocks += 4*inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
xorBlocks += 4*xorIncrement;
}
func4(block0, block1, block2, block3, subKeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
xorBlocks += 4*xorIncrement;
}
// I can't get Store to run faster using indexed offsets
VectorStore(block0, outBlocks);
outBlocks += outIncrement;
VectorStore(block1, outBlocks);
outBlocks += outIncrement;
VectorStore(block2, outBlocks);
outBlocks += outIncrement;
VectorStore(block3, outBlocks);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
VectorType block = VectorLoad(inBlocks);
if (flags & BlockTransformation::BT_XorInput)
block = VectorXor(block, VectorLoad(xorBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subKeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = VectorXor(block, VectorLoad(xorBlocks));
VectorStore(block, outBlocks);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AES_AVAILABLE
NAMESPACE_END
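
The new Rijndael_Enc_AdvancedProcessBlocks_POWER8 and Rijndael_Dec_AdvancedProcessBlocks_POWER8 entry points follow the AdvancedProcessBlocks contract: they consume whole 16-byte blocks according to the flags and return the number of trailing bytes left unprocessed. A rough caller-side sketch follows; the helper name and key-schedule setup are placeholders, and real callers reach this code through Rijndael::Enc::AdvancedProcessBlocks with subkeys produced by UncheckedSetKey.

// Hypothetical helper showing how the dispatcher might be driven directly.
// subKeys must point at the 16-byte aligned round-key schedule; rounds is
// 10, 12 or 14 for AES-128/192/256; length should be at least one block.
size_t EncryptWholeBlocks(const word32* subKeys, size_t rounds,
                          const byte* in, byte* out, size_t length)
{
    // Forward direction, no xor input; BT_AllowParallel enables the 4-block path.
    const word32 flags = BlockTransformation::BT_AllowParallel;
    // The return value is the tail length (fewer than 16 bytes) not processed.
    return Rijndael_Enc_AdvancedProcessBlocks_POWER8(
        subKeys, rounds, in, NULLPTR /* no xor blocks */, out, length, flags);
}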


@@ -253,10 +253,10 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void ByteReverseArrayLE(byte src[16]);
extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
@@ -408,7 +408,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -502,7 +502,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -1130,7 +1130,7 @@ Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
// Do nothing
Rijndael::Enc::Enc() { }
#endif
@@ -1146,6 +1146,10 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
if (HasAES())
return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (HasAES())
return Rijndael_Enc_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
if (HasSSE2())
@@ -1205,11 +1209,14 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
if (HasAESNI())
return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
if (HasAES())
return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (HasAES())
return Rijndael_Dec_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}


@@ -12,7 +12,8 @@
#include "seckey.h"
#include "secblock.h"
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || \
CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1
#endif