From 3e55817819b10664351561d6ba07869ef89c4491 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Fri, 22 Sep 2017 04:15:33 -0400
Subject: [PATCH] Add C++ templates for additional Vector ops

Removed lower-level C-like functions such as Store8x16 and Store64x2
---
 rijndael-simd.cpp | 139 ++++++++++++++++------------------------------
 1 file changed, 47 insertions(+), 92 deletions(-)

diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index f2946cb8..1e553c0e 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -776,6 +776,12 @@ typedef __vector unsigned char uint8x16_p8;
 typedef __vector unsigned int uint32x4_p8;
 typedef __vector unsigned long long uint64x2_p8;
 
+#if defined(CRYPTOPP_XLC_VERSION)
+typedef uint8x16_p8 VectorType;
+#elif defined(CRYPTOPP_GCC_VERSION)
+typedef uint64x2_p8 VectorType;
+#endif
+
 void ReverseByteArrayLE(byte src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
@@ -787,92 +793,48 @@ void ReverseByteArrayLE(byte src[16])
 #endif
 }
 
-static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src)
+template <class T1>
+static inline T1 Reverse(const T1& src)
 {
     const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
     const uint8x16_p8 zero = {0};
     return vec_perm(src, zero, mask);
 }
 
-static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src)
-{
-    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    const uint8x16_p8 zero = {0};
-    return (uint64x2_p8)vec_perm((uint8x16_p8)src, zero, mask);
-}
-
-static inline uint8x16_p8 Load8x16(const uint8_t src[16])
+static inline VectorType VectorLoadBE(const uint8_t src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
-    return vec_xl_be(0, (uint8_t*)src);
+    return (VectorType)vec_xl_be(0, (uint8_t*)src);
 #else
 # if defined(IS_LITTLE_ENDIAN)
-    return Reverse8x16(vec_vsx_ld(0, (uint8_t*)src));
+    return (VectorType)Reverse(vec_vsx_ld(0, (uint8_t*)src));
 # else
-    return vec_vsx_ld(0, (uint8_t*)src);
+    return (VectorType)vec_vsx_ld(0, (uint8_t*)src);
# endif
 #endif
 }
 
-static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16])
+static inline VectorType VectorLoadBE(int off, const uint8_t src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
-    return vec_xl_be(off, (uint8_t*)src);
+    return (VectorType)vec_xl_be(off, (uint8_t*)src);
 #else
 # if defined(IS_LITTLE_ENDIAN)
-    return Reverse8x16(vec_vsx_ld(off, (uint8_t*)src));
+    return (VectorType)Reverse(vec_vsx_ld(off, (uint8_t*)src));
 # else
-    return vec_vsx_ld(off, (uint8_t*)src);
+    return (VectorType)vec_vsx_ld(off, (uint8_t*)src);
 # endif
 #endif
 }
 
-static inline void Store8x16(const uint8x16_p8& src, uint8_t dest[16])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    vec_xst_be(src, 0, (uint8_t*)dest);
-#else
-# if defined(IS_LITTLE_ENDIAN)
-    vec_vsx_st(Reverse8x16(src), 0, (uint8_t*)dest);
-# else
-    vec_vsx_st(src, 0, (uint8_t*)dest);
-# endif
-#endif
-}
-
-static inline uint64x2_p8 Load64x2(const uint8_t src[16])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src);
-#else
-# if defined(IS_LITTLE_ENDIAN)
-    return Reverse64x2((uint64x2_p8)vec_vsx_ld(0, (uint8_t*)src));
-# else
-    return (uint64x2_p8)vec_vsx_ld(0, (uint8_t*)src);
-# endif
-#endif
-}
-
-static inline uint64x2_p8 Load64x2(int off, const uint8_t src[16])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint64x2_p8)vec_xl_be(off, (uint8_t*)src);
-#else
-# if defined(IS_LITTLE_ENDIAN)
-    return (uint64x2_p8)Reverse8x16(vec_vsx_ld(off, (uint8_t*)src));
-# else
-    return (uint64x2_p8)vec_vsx_ld(off, (uint8_t*)src);
-# endif
-#endif
-}
-
-static inline void Store64x2(const uint64x2_p8& src, uint8_t dest[16])
+template <class T1>
+static inline void VectorStoreBE(const T1& src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest);
 #else
 # if defined(IS_LITTLE_ENDIAN)
-    vec_vsx_st((uint8x16_p8)Reverse64x2(src), 0, (uint8_t*)dest);
+    vec_vsx_st(Reverse((uint8x16_p8)src), 0, (uint8_t*)dest);
 # else
     vec_vsx_st((uint8x16_p8)src, 0, (uint8_t*)dest);
 # endif
@@ -881,22 +843,16 @@ static inline void Store64x2(const uint64x2_p8& src, uint8_t dest[16])
 
 //////////////////////////////////////////////////////////////////
 
-#if defined(CRYPTOPP_XLC_VERSION)
-    typedef uint8x16_p8 VectorType;
-#elif defined(CRYPTOPP_GCC_VERSION)
-    typedef uint64x2_p8 VectorType;
-#endif
-
 // Loads a mis-aligned byte array, performs an endian conversion.
 static inline VectorType VectorLoad(const byte src[16])
 {
-    return (VectorType)Load8x16(0, (uint8_t*)src);
+    return (VectorType)VectorLoadBE((uint8_t*)src);
 }
 
 // Loads a mis-aligned byte array, performs an endian conversion.
 static inline VectorType VectorLoad(int off, const byte src[16])
 {
-    return (VectorType)Load8x16(off, (uint8_t*)src);
+    return (VectorType)VectorLoadBE(off, (uint8_t*)src);
 }
 
 // Loads a byte array, does not perform an endian conversion.
@@ -921,15 +877,16 @@ static inline VectorType VectorLoadKey(int off, const byte src[16])
 }
 
 // Stores to a mis-aligned byte array, performs an endian conversion.
-static inline void VectorStore(const uint8x16_p8& src, byte dest[16])
+template <class T1>
+static inline void VectorStore(const T1& src, byte dest[16])
 {
-    return Store8x16(src, (uint8_t*)dest);
+    return VectorStoreBE(src, (uint8_t*)dest);
 }
 
-// Stores to a mis-aligned byte array, performs an endian conversion.
-static inline void VectorStore(const uint64x2_p8& src, byte dest[16])
+template <class T1, class T2>
+static inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
 {
-    return Store64x2(src, (uint8_t*)dest);
+    return (T1)vec_perm(vec1, vec2, (uint8x16_p8)mask);
 }
 
 template <class T1, class T2>
@@ -944,6 +901,16 @@ static inline T1 VectorAdd(const T1& vec1, const T2& vec2)
     return (T1)vec_add(vec1, (T1)vec2);
 }
 
+template <unsigned int C, class T1, class T2>
+static inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
+{
+#if defined(IS_LITTLE_ENDIAN)
+    return (T1)vec_sld((uint8x16_p8)vec2, (uint8x16_p8)vec1, 16-C);
+#else
+    return (T1)vec_sld((uint8x16_p8)vec1, (uint8x16_p8)vec2, C);
+#endif
+}
+
 template <class T1, class T2>
 static inline T1 VectorEncrypt(const T1& state, const T2& key)
 {
@@ -1027,33 +994,21 @@ Rijndael_Subkey_POWER8(uint8x16_p8 r1, const uint8x16_p8 r4, const uint8x16_p8 r
     const uint8x16_p8 r0 = {0};
     uint8x16_p8 r3, r6;
 
-#if defined(IS_LITTLE_ENDIAN)
-    r3 = vec_perm(r1, r1, r5);            /* line 1 */
-    r6 = vec_sld(r1, r0, 4);              /* line 2 */
-    r3 = VectorEncryptLast(r3, r4);       /* line 3 */
+    r3 = VectorPermute(r1, r1, r5);       /* line 1 */
+    r6 = VectorShiftLeft<12>(r0, r1);     /* line 2 */
+    r3 = VectorEncryptLast(r3, r4);       /* line 3 */
 
-    r1 = vec_xor(r1, r6);                 /* line 4 */
-    r6 = vec_sld(r6, r0, 4);              /* line 5 */
-    r1 = vec_xor(r1, r6);                 /* line 6 */
-    r6 = vec_sld(r6, r0, 4);              /* line 7 */
-    r1 = vec_xor(r1, r6);                 /* line 8 */
-#else
-    r3 = vec_perm(r1, r1, r5);            /* line 1 */
-    r6 = vec_sld(r0, r1, 12);             /* line 2 */
-    r3 = VectorEncryptLast(r3, r4);       /* line 3 */
-
-    r1 = vec_xor(r1, r6);                 /* line 4 */
-    r6 = vec_sld(r0, r6, 12);             /* line 5 */
-    r1 = vec_xor(r1, r6);                 /* line 6 */
-    r6 = vec_sld(r0, r6, 12);             /* line 7 */
-    r1 = vec_xor(r1, r6);                 /* line 8 */
-#endif
+    r1 = VectorXor(r1, r6);               /* line 4 */
+    r6 = VectorShiftLeft<12>(r0, r1);     /* line 5 */
+    r1 = VectorXor(r1, r6);               /* line 6 */
+    r6 = VectorShiftLeft<12>(r0, r1);     /* line 7 */
+    r1 = VectorXor(r1, r6);               /* line 8 */
 
     // Caller handles r4 (rcon) addition
-    // r4 = vec_add(r4, r4);              /* line 9 */
+    // r4 = VectorAdd(r4, r4);            /* line 9 */
 
     // r1 is ready for next round
-    r1 = vec_xor(r1, r3);                 /* line 10 */
+    r1 = VectorXor(r1, r3);               /* line 10 */
     return r1;
 }
 
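
Usage note (illustrative, not part of the patch): the sketch below shows how the
templated wrappers are meant to compose once this change is applied. It is a
hypothetical helper written against the definitions above plus the VectorLoadKey
and VectorEncryptLast routines already present in rijndael-simd.cpp's POWER8
section; the function name and parameters are invented for illustration, and the
code only compiles for the POWER8 paths (CRYPTOPP_XLC_VERSION or
CRYPTOPP_GCC_VERSION). It is not the library's block-processing code.

    // Hypothetical sketch: load a block and a round key, mix them, apply one
    // final-round encrypt step via the existing VectorEncryptLast wrapper,
    // then store the result.
    static inline void SketchUseOfWrappers(const byte inBlock[16],
                                           const byte roundKey[16],
                                           byte outBlock[16])
    {
        // VectorLoad goes through VectorLoadBE, so it performs the endian
        // conversion on little-endian builds; VectorLoadKey does not convert.
        VectorType state = VectorLoad(inBlock);
        const VectorType rkey = VectorLoadKey(0, roundKey);

        // The templated helpers accept either vector type, so the same code
        // serves uint8x16_p8 (XLC) and uint64x2_p8 (GCC) builds.
        state = VectorXor(state, rkey);
        state = VectorEncryptLast(state, rkey);

        // A single templated VectorStore/VectorStoreBE pair replaces the
        // removed Store8x16 and Store64x2 helpers and restores byte order.
        VectorStore(state, outBlock);
    }

The same idea drives the rewritten Rijndael_Subkey_POWER8: VectorShiftLeft hides
the vec_sld operand-order and offset difference behind IS_LITTLE_ENDIAN, so the
key-schedule helper no longer needs separate little-endian and big-endian paths.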