From 194307308cd08fe73035aa0cf2a03c42452847da Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Mon, 6 Aug 2018 18:06:32 -0400
Subject: [PATCH] Cleanup VPMSUM probes

---
 gcm-simd.cpp | 81 ++++++++++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/gcm-simd.cpp b/gcm-simd.cpp
index f58a4e05..123fefde 100644
--- a/gcm-simd.cpp
+++ b/gcm-simd.cpp
@@ -67,31 +67,6 @@ ANONYMOUS_NAMESPACE_BEGIN
 
 // ************************* Miscellaneous ************************* //
 
-// GCC 4.8 is missing PMULL gear
-#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
-# if (CRYPTOPP_GCC_VERSION >= 40800) && (CRYPTOPP_GCC_VERSION < 49000)
-inline poly128_t VMULL_P64(poly64_t a, poly64_t b)
-{
-    return __builtin_aarch64_crypto_pmulldi_ppp (a, b);
-}
-
-inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b)
-{
-    return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
-}
-# else
-inline poly128_t VMULL_P64(poly64_t a, poly64_t b)
-{
-    return vmull_p64(a, b);
-}
-
-inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b)
-{
-    return vmull_high_p64(a, b);
-}
-# endif
-#endif
-
 #if CRYPTOPP_ARM_PMULL_AVAILABLE
 #if defined(__GNUC__)
 // Schneiders, Hovsmith and O'Rourke used this trick.
@@ -189,31 +164,55 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
 
 #if CRYPTOPP_POWER8_PMULL_AVAILABLE
-using CryptoPP::uint8x16_p;
 using CryptoPP::uint64x2_p;
-using CryptoPP::VectorXor;
+using CryptoPP::VectorAnd;
 using CryptoPP::VectorShiftRight;
 
-inline uint64x2_p VMULL_P64(uint64x2_p a, uint64x2_p b)
+inline uint64x2_p VMULL_00(uint64x2_p a, uint64x2_p b)
 {
 	// Multiplies low dwords
 #if defined(__xlc__) || defined(__xlC__)
-	return __vpmsumd (a, b);
+	const uint64x2_p m = {0xffffffffffffffffull, 0};
+	return __vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
 #else
-	return __builtin_crypto_vpmsumd (a, b);
+	const uint64x2_p m = {0xffffffffffffffffull, 0};
+	return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
 #endif
 }
 
-inline uint64x2_p VMULL_HIGH_P64(uint64x2_p a, uint64x2_p b)
+inline uint64x2_p VMULL_01(uint64x2_p a, uint64x2_p b)
 {
+	// Multiplies high and low dwords
 #if defined(__xlc__) || defined(__xlC__)
-	const uint64x2_p s = VectorShiftRight<8>(a);
-	const uint64x2_p t = VectorShiftRight<8>(b);
-	return __vpmsumd (s, t);
+	const uint64x2_p m = {0xffffffffffffffffull, 0};
+	return __vpmsumd (VectorAnd(a, m), VectorShiftRight<8>(b));
 #else
-	const uint64x2_p s = VectorShiftRight<8>(a);
-	const uint64x2_p t = VectorShiftRight<8>(b);
-	return __builtin_crypto_vpmsumd (s, t);
+	const uint64x2_p m = {0xffffffffffffffffull, 0};
+	return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorShiftRight<8>(b));
+#endif
+}
+
+inline uint64x2_p VMULL_10(uint64x2_p a, uint64x2_p b)
+{
+	// Multiplies high and low dwords
+#if defined(__xlc__) || defined(__xlC__)
+	const uint64x2_p m = {0xffffffffffffffffull, 0};
+	return __vpmsumd (VectorShiftRight<8>(a), VectorAnd(b, m));
+#else
+	const uint64x2_p m = {0xffffffffffffffffull, 0};
+	return __builtin_crypto_vpmsumd (VectorShiftRight<8>(a), VectorAnd(b, m));
+#endif
+}
+
+inline uint64x2_p VMULL_11(uint64x2_p a, uint64x2_p b)
+{
+	// Multiplies high dwords
+#if defined(__xlc__) || defined(__xlC__)
+	const uint64x2_p m = {0, 0xffffffffffffffffull};
+	return __vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
+#else
+	const uint64x2_p m = {0, 0xffffffffffffffffull};
+	return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
 #endif
 }
 #endif // CRYPTOPP_POWER8_PMULL_AVAILABLE
@@ -293,8 +292,8 @@ bool CPU_ProbePMULL()
                          b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
                              0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
 
-        const poly128_t r1 = VMULL_P64(a1, b1);
-        const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2));
+        const poly128_t r1 = VMULL_00(a1, b1);
+        const poly128_t r2 = VMULL_11((poly64x2_t)(a2), (poly64x2_t)(b2));
 
         // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
         const uint64x2_t t1 = (uint64x2_t)(r1);  // {bignum,bignum}
@@ -344,8 +343,8 @@ bool CPU_ProbePMULL()
                          b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
                              0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
 
-        const uint64x2_p r1 = VMULL_P64(a1, b1);
-        const uint64x2_p r2 = VMULL_HIGH_P64((uint64x2_p)(a2), (uint64x2_p)(b2));
+        const uint64x2_p r1 = VMULL_00(a1, b1);
+        const uint64x2_p r2 = VMULL_11((uint64x2_p)(a2), (uint64x2_p)(b2));
 
         word64 w1[2], w2[2];
 		VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);