Fix LLVM Clang compile on PowerPC

pull/748/head
Jeffrey Walton 2018-11-19 02:28:29 -05:00
parent c9f1a26024
commit 3129ad4d70
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
8 changed files with 124 additions and 67 deletions

View File

@ -71,8 +71,8 @@ IS_MINGW := $(shell echo "$(SYSTEMX)" | $(GREP) -i -c "MinGW")
IS_CYGWIN := $(shell echo "$(SYSTEMX)" | $(GREP) -i -c "Cygwin")
IS_DARWIN := $(shell echo "$(SYSTEMX)" | $(GREP) -i -c "Darwin")
IS_NETBSD := $(shell echo "$(SYSTEMX)" | $(GREP) -i -c "NetBSD")
IS_AIX := $(shell echo "$(UNAMEX)" | $(GREP) -i -c "aix")
IS_SUN := $(shell echo "$(UNAMEX)" | $(GREP) -i -c "SunOS")
IS_AIX := $(shell echo "$(SYSTEMX)" | $(GREP) -i -c "aix")
IS_SUN := $(shell echo "$(SYSTEMX)" | $(GREP) -i -c "SunOS")
SUN_COMPILER := $(shell $(CXX) -V 2>&1 | $(GREP) -i -c -E 'CC: (Sun|Studio)')
GCC_COMPILER := $(shell $(CXX) --version 2>/dev/null | $(GREP) -v -E '(llvm|clang)' | $(GREP) -i -c -E '(gcc|g\+\+)')
@ -118,8 +118,8 @@ endif
# Fixup AIX
ifeq ($(IS_AIX),1)
TPROG = TestPrograms/test_64bit.cxx
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | $(GREP) -i -c -E $(BAD_RESULT))
ifeq ($(HAVE_OPT),0)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
IS_PPC64=1
else
IS_PPC32=1
@ -623,17 +623,51 @@ ifeq ($(DETECT_FEATURES),1)
# endif
#endif
#####################################################################
# AES is a separate submodule of POWER8 due to possible export
# restrictions by the government. It is the reason LLVM chose
# different intrinsics than GCC and XLC.
TPROG = TestPrograms/test_ppc_aes.cxx
TOPT = $(POWER9_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
AES_FLAG = $(POWER9_FLAG)
endif
TPROG = TestPrograms/test_ppc_aes.cxx
TOPT = $(POWER8_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
AES_FLAG = $(POWER8_FLAG)
endif
TPROG = TestPrograms/test_ppc_sha.cxx
TOPT = $(POWER9_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
SHA_FLAG = $(POWER9_FLAG)
endif
TPROG = TestPrograms/test_ppc_sha.cxx
TOPT = $(POWER8_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
SHA_FLAG = $(POWER8_FLAG)
endif
#####################################################################
# Looking for a POWER8 option
TPROG = TestPrograms/test_ppc_power8.cxx
TOPT = $(POWER9_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
ALTIVEC_FLAG = $(POWER9_FLAG)
AES_FLAG = $(POWER9_FLAG)
BLAKE2B_FLAG = $(POWER9_FLAG)
BLAKE2S_FLAG = $(POWER9_FLAG)
CHACHA_FLAG = $(POWER9_FLAG)
GCM_FLAG = $(POWER9_FLAG)
SHA_FLAG = $(POWER9_FLAG)
SM4_FLAG = $(POWER9_FLAG)
SIMON64_FLAG = $(POWER9_FLAG)
SIMON128_FLAG = $(POWER9_FLAG)
@ -648,12 +682,10 @@ ifeq ($(DETECT_FEATURES),1)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
ifeq ($(strip $(HAVE_OPT)),0)
ALTIVEC_FLAG = $(POWER8_FLAG)
AES_FLAG = $(POWER8_FLAG)
BLAKE2B_FLAG = $(POWER8_FLAG)
BLAKE2S_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
GCM_FLAG = $(POWER8_FLAG)
SHA_FLAG = $(POWER8_FLAG)
SM4_FLAG = $(POWER8_FLAG)
SIMON64_FLAG = $(POWER8_FLAG)
SIMON128_FLAG = $(POWER8_FLAG)
@ -663,6 +695,9 @@ ifeq ($(DETECT_FEATURES),1)
POWER8_FLAG =
endif
#####################################################################
# Looking for a POWER7 option
TPROG = TestPrograms/test_ppc_power7.cxx
TOPT = $(POWER7_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
@ -680,6 +715,9 @@ ifeq ($(DETECT_FEATURES),1)
POWER7_FLAG =
endif
#####################################################################
# Looking for an Altivec option
TPROG = TestPrograms/test_ppc_altivec.cxx
TOPT = $(POWER6_FLAG)
HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l)
@ -707,6 +745,9 @@ ifeq ($(DETECT_FEATURES),1)
POWER4_FLAG =
endif
#####################################################################
# Fixups for algorithms that can drop to a lower ISA, if needed
# Drop to Power7 if Power8 is not available.
ifeq ($(POWER8_FLAG),)
GCM_FLAG = $(POWER7_FLAG)
@ -720,6 +761,9 @@ ifeq ($(DETECT_FEATURES),1)
SPECK64_FLAG = $(ALTIVEC_FLAG)
endif
#####################################################################
# Fixups for missing ISAs
ifeq ($(ALTIVEC_FLAG),)
CXXFLAGS += -DCRYPTOPP_DISABLE_ALTIVEC
else ifeq ($(POWER9_FLAG)$(POWER8_FLAG)$(POWER7_FLAG),)
@ -728,6 +772,19 @@ ifeq ($(DETECT_FEATURES),1)
CXXFLAGS += -DCRYPTOPP_DISABLE_POWER8
endif
#####################################################################
# Fixups for missing crypto
ifneq ($(POWER9_FLAG)$(POWER8_FLAG),)
ifeq ($(AES_FLAG),)
CXXFLAGS += -DCRYPTOPP_DISABLE_POWER8_AES
endif
ifeq ($(SHA_FLAG),)
CXXFLAGS += -DCRYPTOPP_DISABLE_POWER8_SHA
endif
# CXXFLAGS += -DCRYPTOPP_DISABLE_POWER8_VMULL
endif
# DETECT_FEATURES
endif

View File

@ -812,6 +812,9 @@ inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
}
// Quiet IBM XLC warning
return VecXor(a, a);
}
template <unsigned int E1, unsigned int E2, unsigned int E3, unsigned int E4>
@ -1005,14 +1008,14 @@ void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
{
BLAKE2_Compress32_CORE(input, state);
BLAKE2_Compress32_CORE(input, state);
}
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
{
BLAKE2_Compress32_CORE(input, state);
BLAKE2_Compress32_CORE(input, state);
}
#endif

View File

@ -64,7 +64,7 @@ extern const char GCM_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN
// ************************* Miscellaneous ************************* //
// *************************** ARM NEON *************************** //
#if CRYPTOPP_ARM_PMULL_AVAILABLE
#if defined(__GNUC__)
@ -168,7 +168,10 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
#endif // Microsoft and compatibles
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
// ************************** Power 8 Crypto ************************** //
#if CRYPTOPP_POWER8_VMULL_AVAILABLE
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VecGetLow;
@ -201,8 +204,10 @@ inline uint64x2_p VMULL2LE(const uint64x2_p& val)
// _mm_clmulepi64_si128(a, b, 0x00)
inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
#elif defined(__clang__)
return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
#endif
@ -214,8 +219,10 @@ inline uint64x2_p VMULL_01LE(const uint64x2_p& a, const uint64x2_p& b)
// Small speedup. VecGetHigh(b) ensures the high dword of 'b' is 0.
// The 0 used in the vmull yields 0 for the high product, so the high
// dword of 'a' is "don't care".
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
#elif defined(__clang__)
return VMULL2LE(__builtin_altivec_crypto_vpmsumd (a, VecGetHigh(b)));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
#endif
@ -227,8 +234,10 @@ inline uint64x2_p VMULL_10LE(const uint64x2_p& a, const uint64x2_p& b)
// Small speedup. VecGetHigh(a) ensures the high dword of 'a' is 0.
// The 0 used in the vmull yields 0 for the high product, so the high
// dword of 'b' is "don't care".
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
#elif defined(__clang__)
return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), b));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
#endif
@ -240,8 +249,10 @@ inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
// Small speedup. VecGetLow(a) ensures the high dword of 'a' is 0.
// The 0 used in the vmull yields 0 for the high product, so the high
// dword of 'b' is "don't care".
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return VMULL2LE(__vpmsumd (VecGetLow(a), b));
#elif defined(__clang__)
return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetLow(a), b));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
#endif

View File

@ -65,7 +65,7 @@ extern "C" {
byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17];
// Specifically call the VSX loads and stores
#if defined(__xlc__) || defined(__xlC__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
vec_xst(vec_xl(0, b1+3), 0, b2+1);
#else
vec_vsx_st(vec_vsx_ld(0, b1+3), 0, b2+1);

View File

@ -66,7 +66,7 @@ bool CPU_ProbePower8()
word64 w1[2] = {x, x}, w2[2] = {4, 6}, w3[2];
// Specifically call the VSX loads and stores
#if defined(__xlc__) || defined(__xlC__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1);
const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2);
const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add

View File

@ -32,6 +32,12 @@
# undef bool
#endif
// IBM XLC on AIX does not define __CRYPTO__ like it should. More LLVM goodness.
#if defined(_AIX) && defined(__xlC__)
# undef __CRYPTO__
# define __CRYPTO__ 1
#endif
// VecLoad_ALTIVEC and VecStore_ALTIVEC are
// too noisy on modern compilers
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
@ -879,7 +885,7 @@ inline bool VecNotEqual(const T1 vec1, const T2 vec2)
//////////////////////// Power8 Crypto ////////////////////////
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
#if defined(__CRYPTO__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief One round of AES encryption
/// \tparam T1 vector type
@ -893,8 +899,10 @@ inline bool VecNotEqual(const T1 vec1, const T2 vec2)
template <class T1, class T2>
inline T1 VecEncrypt(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#else
@ -914,8 +922,10 @@ inline T1 VecEncrypt(const T1 state, const T2 key)
template <class T1, class T2>
inline T1 VecEncryptLast(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
@ -935,8 +945,10 @@ inline T1 VecEncryptLast(const T1 state, const T2 key)
template <class T1, class T2>
inline T1 VecDecrypt(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#else
@ -956,8 +968,10 @@ inline T1 VecDecrypt(const T1 state, const T2 key)
template <class T1, class T2>
inline T1 VecDecryptLast(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
@ -977,8 +991,10 @@ inline T1 VecDecryptLast(const T1 state, const T2 key)
template <int func, int subfunc, class T>
inline T VecSHA256(const T vec)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc);
#elif defined(__clang__)
return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)vec, func, subfunc);
#elif defined(__GNUC__)
return (T)__builtin_crypto_vshasigmaw((uint32x4_p)vec, func, subfunc);
#else
@ -998,8 +1014,10 @@ inline T VecSHA256(const T vec)
template <int func, int subfunc, class T>
inline T VecSHA512(const T vec)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return (T)__vshasigmad((uint64x2_p)vec, func, subfunc);
#elif defined(__clang__)
return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)vec, func, subfunc);
#elif defined(__GNUC__)
return (T)__builtin_crypto_vshasigmad((uint64x2_p)vec, func, subfunc);
#else
@ -1007,7 +1025,7 @@ inline T VecSHA512(const T vec)
#endif
}
#endif // _ARCH_PWR8
#endif // __CRYPTO__
#endif // _ALTIVEC_

View File

@ -529,7 +529,7 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
#endif // CRYPTOPP_AESNI_AVAILABLE
// ***************************** Power 8 ***************************** //
// ************************** Power 8 Crypto ************************** //
#if (CRYPTOPP_POWER8_AES_AVAILABLE)

View File

@ -222,7 +222,7 @@ bool CPU_ProbeSHA256()
else
{
byte r[16], z[16] = {0};
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
uint8x16_p x = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
x = VecSHA256<0,0>(x);
x = VecSHA256<0,1>(x);
@ -1142,41 +1142,25 @@ uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p
static inline
uint32x4_p8 Vector_sigma0(const uint32x4_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmaw(val, 0, 0);
#else
return __builtin_crypto_vshasigmaw(val, 0, 0);
#endif
return VecSHA256<0,0>(val);
}
static inline
uint32x4_p8 Vector_sigma1(const uint32x4_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmaw(val, 0, 0xf);
#else
return __builtin_crypto_vshasigmaw(val, 0, 0xf);
#endif
return VecSHA256<0,0xf>(val);
}
static inline
uint32x4_p8 VectorSigma0(const uint32x4_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmaw(val, 1, 0);
#else
return __builtin_crypto_vshasigmaw(val, 1, 0);
#endif
return VecSHA256<1,0>(val);
}
static inline
uint32x4_p8 VectorSigma1(const uint32x4_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmaw(val, 1, 0xf);
#else
return __builtin_crypto_vshasigmaw(val, 1, 0xf);
#endif
return VecSHA256<1,0xf>(val);
}
static inline
@ -1417,41 +1401,25 @@ uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p
static inline
uint64x2_p8 Vector_sigma0(const uint64x2_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmad(val, 0, 0);
#else
return __builtin_crypto_vshasigmad(val, 0, 0);
#endif
return VecSHA512<0,0>(val);
}
static inline
uint64x2_p8 Vector_sigma1(const uint64x2_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmad(val, 0, 0xf);
#else
return __builtin_crypto_vshasigmad(val, 0, 0xf);
#endif
return VecSHA512<0,0xf>(val);
}
static inline
uint64x2_p8 VectorSigma0(const uint64x2_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmad(val, 1, 0);
#else
return __builtin_crypto_vshasigmad(val, 1, 0);
#endif
return VecSHA512<1,0>(val);
}
static inline
uint64x2_p8 VectorSigma1(const uint64x2_p8 val)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return __vshasigmad(val, 1, 0xf);
#else
return __builtin_crypto_vshasigmad(val, 1, 0xf);
#endif
return VecSHA512<1,0xf>(val);
}
static inline