diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index 0c624fa8..ff46e564 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -817,6 +817,22 @@ uint8x16_p8 Load8x16(const uint8_t src[16])
 #endif
 }
 
+// Loads the 16 bytes found at byte offset 'off' from 'src' and returns them as a uint8x16_p8.
+uint8x16_p8 Load8x16(int off, const uint8_t src[16])
+{
+#if !defined(CRYPTOPP_XLC_VERSION)
+	return (uint8x16_p8)vec_vsx_ld(off, src);  // GCC, Clang, etc
+#else
+	// XL C/C++: const is cast away before the load; see http://stackoverflow.com/q/46124383/608639
+	uint8_t* const ptr = (uint8_t*)src;
+# if !defined(IS_LITTLE_ENDIAN)
+	return vec_xl(off, ptr);
+# else
+	return vec_xl_be(off, ptr);
+# endif
+#endif
+}
+
 void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
@@ -854,6 +870,28 @@ uint64x2_p8 Load64x2(const uint8_t src[16])
 #endif
 }
 
+// Loads the 16 bytes found at byte offset 'off' from 'src' and returns them viewed as uint64x2_p8.
+uint64x2_p8 Load64x2(int off, const uint8_t src[16])
+{
+#if !defined(CRYPTOPP_XLC_VERSION)  // GCC, Clang, etc
+# if !defined(IS_LITTLE_ENDIAN)
+	return (uint64x2_p8)vec_vsx_ld(off, src);
+# else
+	const uint8x16_p8 reverse = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};  // picks bytes 15..0 of the load
+	const uint8x16_p8 unused = {0};  // second vec_perm source; no index in 'reverse' selects from it
+	return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), unused, reverse);
+# endif
+#else
+	// XL C/C++: const is cast away before the load; see http://stackoverflow.com/q/46124383/608639
+	uint8_t* const ptr = (uint8_t*)src;
+# if !defined(IS_LITTLE_ENDIAN)
+	return (uint64x2_p8)vec_xl(off, ptr);
+# else
+	return (uint64x2_p8)vec_xl_be(off, ptr);
+# endif
+#endif
+}
+
 void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
@@ -894,6 +932,15 @@ inline VectorType VectorLoad(const byte src[16])
 #endif
 }
 
+inline VectorType VectorLoad(int off, const byte src[16])  // loads the 16 bytes at byte offset 'off' from 'src'
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+	return Load8x16(off, src);
+#else  // GCC, Clang, etc; a bare #else so no configuration can fall off the end without returning
+	return Load64x2(off, src);
+#endif
+}
+
 inline VectorType VectorLoadAligned(const byte vec[16])
 {
 	return (VectorType)vec_ld(0, vec);
@@ -973,64 +1020,238 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
 #endif
 }
 
-//////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
 
-void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock)
+inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)  // encrypts 'block' in place with the schedule at 'subkeys'
 {
 	CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
 	const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-	VectorType s = VectorLoad(inBlock);
 	VectorType k = VectorLoadAligned(keys);
+	block = VectorXor(block, k);  // XOR in the round key stored at offset 0
 
-	s = VectorXor(s, k);
 	for (size_t i=1; i<rounds-1; i+=2)
 	{
-		s = VectorEncrypt(s, VectorLoadAligned(  i*16,   keys));
-		s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
+		block = VectorEncrypt(block, VectorLoadAligned(  i*16,   keys));  // two rounds per pass: keys i and i+1
+		block = VectorEncrypt(block, VectorLoadAligned((i+1)*16, keys));  // pairing relies on an even 'rounds'; if odd, key rounds-1 is used here and again below
 	}
 
-	s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
-	s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
-
-	// According to benchmarks this is a tad bit slower
-	// if (xorBlock)
-	//	s = VectorXor(s, VectorLoad(xorBlock));
-
-	VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
-	s = VectorXor(s, x);
-
-	VectorStore(s, outBlock);
+	block = VectorEncrypt(block, VectorLoadAligned((rounds-1)*16, keys));  // round left over by the paired loop
+	block = VectorEncryptLast(block, VectorLoadAligned(rounds*16, keys));  // final round, key at rounds*16; loading, xor and storing are left to the caller
 }
 
-void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock)
+// Encrypts block0..block3 in place with one key schedule; each round key is loaded once and shared by all four.
+inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
+            VectorType &block3, const word32 *subkeys, unsigned int rounds)
+{
+	CRYPTOPP_ASSERT(subkeys && IsAlignedOn(subkeys, 16));  // same 16-byte alignment contract POWER8_Enc_Block asserts
+	const byte *keys = reinterpret_cast<const byte*>(subkeys);
+
+	VectorType k = VectorLoadAligned(keys);  // round key at offset 0
+	block0 = VectorXor(block0, k);
+	block1 = VectorXor(block1, k);
+	block2 = VectorXor(block2, k);
+	block3 = VectorXor(block3, k);
+
+	for (size_t i=1; i<rounds; ++i)  // round keys 1 .. rounds-1
+	{
+		k = VectorLoadAligned(i*16, keys);
+		block0 = VectorEncrypt(block0, k);
+		block1 = VectorEncrypt(block1, k);
+		block2 = VectorEncrypt(block2, k);
+		block3 = VectorEncrypt(block3, k);
+	}
+
+	k = VectorLoadAligned(rounds*16, keys);  // final round key
+	block0 = VectorEncryptLast(block0, k);
+	block1 = VectorEncryptLast(block1, k);
+	block2 = VectorEncryptLast(block2, k);
+	block3 = VectorEncryptLast(block3, k);
+}
+
+inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)  // decrypts 'block' in place with the schedule at 'subkeys'
 {
 	CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
 	const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-	VectorType s = VectorLoad(inBlock);
 	VectorType k = VectorLoadAligned(rounds*16, keys);
+	block = VectorXor(block, k);  // XOR in the round key stored at rounds*16
 
-	s = VectorXor(s, k);
 	for (size_t i=rounds-1; i>1; i-=2)
 	{
-		s = VectorDecrypt(s, VectorLoadAligned(  i*16,   keys));
-		s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
+		block = VectorDecrypt(block, VectorLoadAligned(  i*16,   keys));  // two rounds per pass, walking the schedule backwards: keys i and i-1
+		block = VectorDecrypt(block, VectorLoadAligned((i-1)*16, keys));  // pairing relies on an even 'rounds'; if odd, key 1 is used here and again below
 	}
 
-	s = VectorDecrypt(s, VectorLoadAligned(16, keys));
-	s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
-
-	// According to benchmarks this is a tad bit slower
-	// if (xorBlock)
-	//	s = VectorXor(s, VectorLoad(xorBlock));
-
-	VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
-	s = VectorXor(s, x);
-
-	VectorStore(s, outBlock);
+	block = VectorDecrypt(block, VectorLoadAligned(16, keys));  // round left over by the paired loop (key 1)
+	block = VectorDecryptLast(block, VectorLoadAligned(0, keys));  // final round, key at offset 0; loading, xor and storing are left to the caller
 }
+
+// Decrypts block0..block3 in place with one key schedule; each round key is loaded once and shared by all four.
+inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
+            VectorType &block3, const word32 *subkeys, unsigned int rounds)
+{
+	CRYPTOPP_ASSERT(subkeys && IsAlignedOn(subkeys, 16));  // same 16-byte alignment contract POWER8_Dec_Block asserts
+	const byte *keys = reinterpret_cast<const byte*>(subkeys);
+	VectorType k = VectorLoadAligned(rounds*16, keys);  // the schedule is consumed from its last key backwards
+	block0 = VectorXor(block0, k);
+	block1 = VectorXor(block1, k);
+	block2 = VectorXor(block2, k);
+	block3 = VectorXor(block3, k);
+
+	for (size_t i=rounds-1; i>0; --i)  // round keys rounds-1 .. 1
+	{
+		k = VectorLoadAligned(i*16, keys);
+		block0 = VectorDecrypt(block0, k);
+		block1 = VectorDecrypt(block1, k);
+		block2 = VectorDecrypt(block2, k);
+		block3 = VectorDecrypt(block3, k);
+	}
+
+	k = VectorLoadAligned(0, keys);  // final round key
+	block0 = VectorDecryptLast(block0, k);
+	block1 = VectorDecryptLast(block1, k);
+	block2 = VectorDecryptLast(block2, k);
+	block3 = VectorDecryptLast(block3, k);
+}
+
+// Shared driver behind the POWER8 AdvancedProcessBlocks entry points. func1 transforms a single
+// block and func4 transforms four blocks, both in place. 'flags' is a mask of BlockTransformation
+// BT_* bits: BT_AllowParallel enables the four-block loop, BT_InBlockIsCounter treats inBlocks as a
+// counter that is bumped in place, BT_XorInput applies xorBlocks before the cipher rather than
+// after it, and BT_ReverseDirection walks the buffers from the last block towards the first.
+// Returns the number of trailing bytes (fewer than one block) that were left unprocessed.
+template <typename F1, typename F4>
+size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
+            const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+	CRYPTOPP_ASSERT(subKeys);
+	CRYPTOPP_ASSERT(inBlocks);
+	CRYPTOPP_ASSERT(outBlocks);
+	CRYPTOPP_ASSERT(length >= 16);
+
+	const size_t blockSize = 16;
+	size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+	size_t xorIncrement = xorBlocks ? blockSize : 0;
+	size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+	if (flags & BlockTransformation::BT_ReverseDirection)
+	{
+		inBlocks += length - blockSize;
+		// Leave a null xorBlocks untouched: it doubles as the "no XOR" test further down.
+		if (xorBlocks)
+			xorBlocks += length - blockSize;
+		outBlocks += length - blockSize;
+		inIncrement = 0-inIncrement;
+		xorIncrement = 0-xorIncrement;
+		outIncrement = 0-outIncrement;
+	}
+
+	if (flags & BlockTransformation::BT_AllowParallel)
+	{
+		while (length >= 4*blockSize)
+		{
+			VectorType block0, block1, block2, block3, temp;
+			block0 = VectorLoad(inBlocks);
+
+			if (flags & BlockTransformation::BT_InBlockIsCounter)
+			{
+				// block1..block3 are block0 plus 1, 2 and 3; block0 plus 4 is written back to inBlocks.
+#if defined(IS_LITTLE_ENDIAN)
+				const VectorType one = {1};
+#else
+				const VectorType one = (VectorType)(uint64x2_p8){0,1};
+#endif
+				block1 = VectorAdd(block0, one);
+				block2 = VectorAdd(block1, one);
+				block3 = VectorAdd(block2, one);
+				temp   = VectorAdd(block3, one);
+				VectorStore(temp, const_cast<byte*>(inBlocks));
+			}
+			else
+			{
+				// Fetch the other three blocks at multiples of inIncrement, then apply all four increments at once.
+				block1 = VectorLoad(1*inIncrement, inBlocks);
+				block2 = VectorLoad(2*inIncrement, inBlocks);
+				block3 = VectorLoad(3*inIncrement, inBlocks);
+				inBlocks += 4*inIncrement;
+			}
+
+			if (flags & BlockTransformation::BT_XorInput)
+			{
+				// BT_XorInput: fold xorBlocks into the four blocks before func4 runs.
+				block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
+				block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
+				block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
+				block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
+				xorBlocks += 4*xorIncrement;
+			}
+
+			func4(block0, block1, block2, block3, subKeys, rounds);
+
+			if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+			{
+				// Otherwise, when xorBlocks is present, fold it into the results of func4.
+				block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
+				block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
+				block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
+				block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
+				xorBlocks += 4*xorIncrement;
+			}
+
+			// I can't get Store to run faster using indexed offsets
+			VectorStore(block0, outBlocks);
+			outBlocks += outIncrement;
+			VectorStore(block1, outBlocks);
+			outBlocks += outIncrement;
+			VectorStore(block2, outBlocks);
+			outBlocks += outIncrement;
+			VectorStore(block3, outBlocks);
+			outBlocks += outIncrement;
+
+			length -= 4*blockSize;
+		}
+	}
+
+	while (length >= blockSize)
+	{
+		VectorType block = VectorLoad(inBlocks);
+
+		if (flags & BlockTransformation::BT_XorInput)
+			block = VectorXor(block, VectorLoad(xorBlocks));
+
+		if (flags & BlockTransformation::BT_InBlockIsCounter)
+			const_cast<byte *>(inBlocks)[15]++;
+
+		func1(block, subKeys, rounds);
+
+		if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+			block = VectorXor(block, VectorLoad(xorBlocks));
+
+		VectorStore(block, outBlocks);
+
+		inBlocks += inIncrement;
+		outBlocks += outIncrement;
+		xorBlocks += xorIncrement;
+		length -= blockSize;
+	}
+
+	return length;
+}
+
+// Encryption entry point: hands the POWER8 encrypt kernels to the shared block-processing driver.
+size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
+            const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+	return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+// Decryption entry point: hands the POWER8 decrypt kernels to the shared block-processing driver.
+size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
+            const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+	return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
 #endif  // CRYPTOPP_POWER8_AES_AVAILABLE
 NAMESPACE_END
diff --git a/rijndael.cpp b/rijndael.cpp
index 3e016d6f..43c7fa80 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -253,10 +253,10 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
 extern void ByteReverseArrayLE(byte src[16]);
 
-extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock);
-extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
-        const byte *inBlock, const byte *xorBlock, byte *outBlock);
+extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,  // defined in rijndael-simd.cpp
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);  // returns the byte count left unprocessed
+extern size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,  // defined in rijndael-simd.cpp
+        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);  // returns the byte count left unprocessed
 #endif
 
 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
@@ -408,7 +408,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
 	if (HasAES())
 	{
-		(void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
+		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
 		return;
 	}
 #endif
@@ -502,7 +502,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
 	if (HasAES())
 	{
-		(void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
+		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
 		return;
 	}
 #endif
@@ -1130,7 +1130,7 @@ Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
 
 #endif  // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
 
-#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
+#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
 // Do nothing
 Rijndael::Enc::Enc() { }
 #endif
@@ -1146,6 +1146,10 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
 	if (HasAES())
 		return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 #endif
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+	if (HasAES())
+		return Rijndael_Enc_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
 
 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
 	if (HasSSE2())
@@ -1205,11 +1209,14 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
 	if (HasAESNI())
 		return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 #endif
-
 #if CRYPTOPP_ARM_AES_AVAILABLE
 	if (HasAES())
 		return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 #endif
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+	if (HasAES())
+		return Rijndael_Dec_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
 
 	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
 }
diff --git a/rijndael.h b/rijndael.h
index 483eab5a..b6cd0faf 100644
--- a/rijndael.h
+++ b/rijndael.h
@@ -12,7 +12,8 @@
 #include "seckey.h"
 #include "secblock.h"
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || \
+	CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
 # define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1
 #endif