Rework Makefile and ppc-simd.h for XLC and LLVM front-end changes

2018-08-05 05:39:42 -04:00 · 2018-08-05 05:39:42 -04:00 · 1dd0e321a6
parent da00422d3c
commit 1dd0e321a6
5 changed files with 492 additions and 446 deletions
--- a/138
+++ b/138
@ -101,6 +101,16 @@ ifeq ($(wildcard adhoc.cpp),)
 $(shell cp adhoc.cpp.proto adhoc.cpp)
 endif
 # Fixup AIX
 # On AIX, pick the PowerPC word size from the kernel bit mode reported by getconf.
 ifeq ($(IS_AIX),1)
  # Simple (:=) assignment: run getconf once at parse time rather than on
  # every expansion of BITNESS.
  BITNESS := $(shell getconf KERNEL_BITMODE)
  ifeq ($(BITNESS),64)
    IS_PPC64=1
  else
    IS_PPC32=1
  endif
 endif
 ###########################################################
 #####                General Variables                #####
 ###########################################################
@ -400,63 +410,93 @@ ifeq ($(IS_ARMV8),1)
  endif
 endif
-# PowerPC and PowerPC-64. Altivec is available with Power4
+# PowerPC and PowerPC-64. Altivec is available with Power4.
-ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000)
+# The tests below are crafted for IBM XLC and the LLVM front-end.
 # XLC/LLVM only supplies POWER8, so we set the flags for XLC/LLVM
 # and lower them if POWER7 or ALTIVEC is available. I've got a
 # feeling LLVM is going to cause a lot of trouble.
 ifneq ($(IS_PPC32)$(IS_PPC64),00)
  HAVE_POWER8 = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power8 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO')
  ifneq ($(HAVE_POWER8),0)
    POWER8_FLAG = -mcpu=power8 -maltivec
    AES_FLAG = $(POWER8_FLAG)
    GCM_FLAG = $(POWER8_FLAG)
    SHA_FLAG = $(POWER8_FLAG)
    SM4_FLAG = $(POWER8_FLAG)
  endif
  # GCC and some compatibles
  HAVE_POWER7 = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power7 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c '_ARCH_PWR7')
  ifneq ($(HAVE_POWER7),0)
    POWER7_FLAG = -mcpu=power7 -maltivec
    ARIA_FLAG = $(POWER7_FLAG)
    BLAKE2_FLAG = $(POWER7_FLAG)
    CHAM_FLAG = $(POWER7_FLAG)
    LEA_FLAG = $(POWER7_FLAG)
    SIMON_FLAG = $(POWER7_FLAG)
    SPECK_FLAG = $(POWER7_FLAG)
    SIMECK_FLAG = $(POWER7_FLAG)
  endif
  # GCC and some compatibles
  HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power4 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c '__ALTIVEC__')
  ifneq ($(HAVE_ALTIVEC),0)
    ALTIVEC_FLAG = -mcpu=power4 -maltivec
    ARIA_FLAG = -mcpu=power4 -maltivec
    BLAKE2_FLAG = -mcpu=power4 -maltivec
    CHAM_FLAG = -mcpu=power4 -maltivec
    LEA_FLAG = -mcpu=power4 -maltivec
    SIMON_FLAG = -mcpu=power4 -maltivec
    SPECK_FLAG = -mcpu=power4 -maltivec
    SIMECK_FLAG = -mcpu=power4 -maltivec
    SM4_FLAG = -mcpu=power7 -maltivec
  endif
  # GCC and some compatibles
  HAVE_CRYPTO = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power8 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO')
  ifneq ($(HAVE_CRYPTO),0)
    ALTIVEC_FLAG = -mcpu=power8 -maltivec
    AES_FLAG = -mcpu=power8 -maltivec
    GCM_FLAG = -mcpu=power8 -maltivec
    SHA_FLAG = -mcpu=power8 -maltivec
    CHAM_FLAG = -mcpu=power8 -maltivec
    LEA_FLAG = -mcpu=power8 -maltivec
    SIMON_FLAG = -mcpu=power8 -maltivec
    SPECK_FLAG = -mcpu=power8 -maltivec
    SIMECK_FLAG = -mcpu=power8 -maltivec
    SM4_FLAG = -mcpu=power8 -maltivec
  endif
  # IBM XL C/C++
-  HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr7 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c '__ALTIVEC__')
+  HAVE_POWER8 = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr8 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO')
  ifneq ($(HAVE_POWER8),0)
    POWER8_FLAG = -qarch=pwr8 -qaltivec
    AES_FLAG = $(POWER8_FLAG)
    GCM_FLAG = $(POWER8_FLAG)
    SHA_FLAG = $(POWER8_FLAG)
    SM4_FLAG = $(POWER8_FLAG)
  endif
  # IBM XL C/C++
  HAVE_POWER7 = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr7 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR7')
  ifneq ($(HAVE_POWER7),0)
    POWER7_FLAG = -qarch=pwr7 -qaltivec
    ARIA_FLAG = $(POWER7_FLAG)
    BLAKE2_FLAG = $(POWER7_FLAG)
    CHAM_FLAG = $(POWER7_FLAG)
    LEA_FLAG = $(POWER7_FLAG)
    SIMECK_FLAG = $(POWER7_FLAG)
    SIMON_FLAG = $(POWER7_FLAG)
    SPECK_FLAG = $(POWER7_FLAG)
  endif
  # IBM XL C/C++
  HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr6 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c '__ALTIVEC__')
  ifneq ($(HAVE_ALTIVEC),0)
-    ALTIVEC_FLAG = -qarch=pwr7 -qaltivec
+    ALTIVEC_FLAG = -qarch=pwr6 -qaltivec
    ARIA_FLAG = -qarch=pwr7 -qaltivec
    BLAKE2_FLAG = -qarch=pwr7 -qaltivec
    CHAM_FLAG = -qarch=pwr7 -qaltivec
    LEA_FLAG = -qarch=pwr7 -qaltivec
    SIMECK_FLAG = -qarch=pwr7 -qaltivec
    SIMON_FLAG = -qarch=pwr7 -qaltivec
    SPECK_FLAG = -qarch=pwr7 -qaltivec
    SM4_FLAG = -qarch=pwr7 -qaltivec
  endif
-  # IBM XL C/C++
+
-  HAVE_CRYPTO = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr8 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO')
+  # LLVM front-ends only provide Power8. It really jams us up
-  ifneq ($(HAVE_CRYPTO),0)
+  # for ppc-simd.cpp which needs ALTIVEC/POWER4. We have similar
-    ALTIVEC_FLAG = -qarch=pwr8 -qaltivec
+  # problems with {lea|cham|simon|speck|...}-simd.cpp and POWER7.
-    AES_FLAG = -qarch=pwr8 -qaltivec
+  HAVE_LLVM = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -E adhoc.cpp 2>&1 | $(GREP) -i -c '__llvm__')
-    GCM_FLAG = -qarch=pwr8 -qaltivec
+  ifneq ($(HAVE_LLVM),0)
-    SHA_FLAG = -qarch=pwr8 -qaltivec
+    POWER7_FLAG = $(POWER8_FLAG)
-    ARIA_FLAG = -qarch=pwr8 -qaltivec
+    ARIA_FLAG = $(POWER8_FLAG)
-    BLAKE2_FLAG = -qarch=pwr8 -qaltivec
+    BLAKE2_FLAG = $(POWER8_FLAG)
-    CHAM_FLAG = -qarch=pwr8 -qaltivec
+    CHAM_FLAG = $(POWER8_FLAG)
-    LEA_FLAG = -qarch=pwr8 -qaltivec
+    LEA_FLAG = $(POWER8_FLAG)
-    SIMECK_FLAG = -qarch=pwr8 -qaltivec
+    SIMECK_FLAG = $(POWER8_FLAG)
-    SIMON_FLAG = -qarch=pwr8 -qaltivec
+    SIMON_FLAG = $(POWER8_FLAG)
-    SPECK_FLAG = -qarch=pwr8 -qaltivec
+    SPECK_FLAG = $(POWER8_FLAG)
-    SM4_FLAG = -qarch=pwr8 -qaltivec
+    ALTIVEC_FLAG = $(POWER8_FLAG)
  endif
  ifeq ($(ALTIVEC_FLAG),)
    CXXFLAGS += -DCRYPTOPP_DISABLE_ALTIVEC
  endif
  ifeq ($(POWER7_FLAG),)
    CXXFLAGS += -DCRYPTOPP_DISABLE_POWER7
  endif
  ifeq ($(POWER8_FLAG),)
    CXXFLAGS += -DCRYPTOPP_DISABLE_POWER8
  endif
 endif
--- a/ppc-simd.cpp
+++ b/ppc-simd.cpp
@ -6,14 +6,6 @@
 //    is needed because additional CXXFLAGS are required to enable the
 //    appropriate instructions sets in some build configurations.
 // TODO: Bob Wilkinson reported we are misdetecting CRYPTOPP_POWER8_AVAILABLE.
 //    The problem is, the updated compiler supports them but the down-level
 //    assembler and linker do not. We will probably need to fix it through
 //    the makefile, similar to the way x86 AES and SHA are handled. For the time
 //    being CRYPTOPP_DISABLE_POWER8 will have to be applied manually. Another
 //    twist is, we don't have access to a test machine and it must be fixed
 //    for two compilers (IBM XL C/C++ and GCC). Ugh...
 #include "pch.h"
 #include "config.h"
 #include "stdcpp.h"
@ -53,7 +45,7 @@ bool CPU_ProbeAltivec()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
-#elif (CRYPTOPP_ALTIVEC_AVAILABLE) || (CRYPTOPP_POWER7_AVAILABLE) || (CRYPTOPP_POWER8_AVAILABLE)
+#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
@ -96,239 +88,5 @@ bool CPU_ProbeAltivec()
 #endif  // CRYPTOPP_ALTIVEC_AVAILABLE
 }
 bool CPU_ProbePower7()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER7_AVAILABLE) || (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        // POWER7 added unaligned loads and store operations
        byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17];
        // Specifically call the VSX loads and stores
        #if defined(__xlc__) || defined(__xlC__)
        vec_xst(vec_xl(0, b1+3), 0, b2+1);
        #else
        vec_vsx_st(vec_vsx_ld(0, b1+3), 0, b2+1);
        #endif
        result = (0 == std::memcmp(b1+3, b2+1, 16));
    }
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER7_AVAILABLE
 }
 bool CPU_ProbePower8()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = true;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        // POWER8 added 64-bit SIMD operations
        const word64 x = W64LIT(0xffffffffffffffff);
        word64 w1[2] = {x, x}, w2[2] = {4, 6}, w3[2];
        // Specifically call the VSX loads and stores
        #if defined(__xlc__) || defined(__xlC__)
        const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1);
        const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2);
        const uint64x2_p v3 = vec_add(v1, v2);  // 64-bit add
        vec_xst((uint8x16_p)v3, 0, (byte*)w3);
        #else
        const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1);
        const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2);
        const uint64x2_p v3 = vec_add(v1, v2);  // 64-bit add
        vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3);
        #endif
        // Relies on integer wrap
        result = (w3[0] == 3 && w3[1] == 5);
    }
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 bool CPU_ProbeAES()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = true;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte key[16] = {0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1,
                        0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05};
        byte state[16] = {0x19, 0x3d, 0xe3, 0xb3, 0xa0, 0xf4, 0xe2, 0x2b,
                          0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08};
        byte r[16] = {255}, z[16] = {};
        uint8x16_p k = (uint8x16_p)VectorLoad(0, key);
        uint8x16_p s = (uint8x16_p)VectorLoad(0, state);
        s = VectorEncrypt(s, k);
        s = VectorEncryptLast(s, k);
        s = VectorDecrypt(s, k);
        s = VectorDecryptLast(s, k);
        VectorStore(s, r);
        result = (0 != std::memcmp(r, z, 16));
    }
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 bool CPU_ProbeSHA256()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte r[16], z[16] = {0};
        uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
        x = VectorSHA256<0,0>(x);
        x = VectorSHA256<0,1>(x);
        x = VectorSHA256<1,0>(x);
        x = VectorSHA256<1,1>(x);
        VectorStore(x, r);
        result = (0 == std::memcmp(r, z, 16));
    }
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 bool CPU_ProbeSHA512()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte r[16], z[16] = {0};
        uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
        x = VectorSHA512<0,0>(x);
        x = VectorSHA512<0,1>(x);
        x = VectorSHA512<1,0>(x);
        x = VectorSHA512<1,1>(x);
        VectorStore(x, r);
        result = (0 == std::memcmp(r, z, 16));
    }
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 # endif  // CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
 NAMESPACE_END
--- a/ppc-simd.h
+++ b/ppc-simd.h
@ -19,6 +19,26 @@
 #include "config.h"
 #include "misc.h"
 // We are boxed into undefining macros like CRYPTOPP_POWER8_AVAILABLE.
 // We set CRYPTOPP_POWER8_AVAILABLE based on compiler versions because
 // we needed them for the SIMD and non-SIMD files. When the SIMD file is
 // compiled it may only get -mcpu=power4 or -mcpu=power7, so the POWER7
 // or POWER8 stuff is not actually available when this header is included.
 #if !defined(__ALTIVEC__)
 # undef CRYPTOPP_ALTIVEC_AVAILABLE
 #endif
 #if !defined(_ARCH_PWR7)
 # undef CRYPTOPP_POWER7_AVAILABLE
 #endif
 #if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(_CRYPTO))
 # undef CRYPTOPP_POWER8_AVAILABLE
 # undef CRYPTOPP_POWER8_AES_AVAILABLE
 # undef CRYPTOPP_POWER8_SHA_AVAILABLE
 # undef CRYPTOPP_POWER8_PMULL_AVAILABLE
 #endif
 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 # include <altivec.h>
 # undef vector
@ -28,84 +48,112 @@
 NAMESPACE_BEGIN(CryptoPP)
 // Datatypes
 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 typedef __vector unsigned char   uint8x16_p;
 typedef __vector unsigned short  uint16x8_p;
 typedef __vector unsigned int    uint32x4_p;
-
+#if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 #if defined(CRYPTOPP_POWER8_AVAILABLE)
 typedef __vector unsigned long long uint64x2_p;
 #endif
 #endif  // ALTIVEC/POWER4 datatypes
-#endif  // CRYPTOPP_ALTIVEC_AVAILABLE
+// POWER4 and above
 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
-#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_POWER7_AVAILABLE)
+/// \brief Reverse a vector
-
+/// \tparam T vector type
-inline uint32x4_p VectorLoad(const byte src[16])
+/// \param src the vector
 /// \details Reverse() endian swaps the bytes in a vector
 /// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
 /// \since Crypto++ 6.0
 template <class T>
 inline T Reverse(const T& src)
 {
    uint8x16_p data;
    if (IsAlignedOn(src, 16))
    {
        data = vec_ld(0, src);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p perm = vec_lvsl(0, src);
        const uint8x16_p low = vec_ld(0, src);
        const uint8x16_p high = vec_ld(15, src);
        data = vec_perm(low, high, perm);
    }
 #if defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)data;
 #else
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    return (uint32x4_p)vec_perm(data, data, mask);
+    return vec_perm(src, src, mask);
 #endif
 }
-inline void VectorStore(const uint32x4_p data, byte dest[16])
+/// \brief Permutes two vectors
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \param mask vector mask
 /// \details VectorPermute returns a new vector from vec1 and vec2
 ///   based on mask. mask is an uint8x16_p type vector. The return
 ///   vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
 inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
 {
    return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
 }
 /// \brief XOR two vectors
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorXor returns a new vector from vec1 and vec2. The return
 ///   vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
 inline T1 VectorXor(const T1& vec1, const T2& vec2)
 {
    return (T1)vec_xor(vec1, (T1)vec2);
 }
 /// \brief Add two vectors
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorAdd returns a new vector from vec1 and vec2.
 ///   vec2 is cast to the same type as vec1. The return vector
 ///   is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
 inline T1 VectorAdd(const T1& vec1, const T2& vec2)
 {
    return (T1)vec_add(vec1, (T1)vec2);
 }
 /// \brief Shift two vectors left
 /// \tparam C shift byte count
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorShiftLeft() concatenates vec1 and vec2 and returns a
 ///   new vector after shifting the concatenation by the specified number
 ///   of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
 ///   vector is the same type as vec1.
 /// \details On big endian machines VectorShiftLeft() is <tt>vec_sld(a, b,
 ///   c)</tt>. On little endian machines VectorShiftLeft() is translated to
 ///   <tt>vec_sld(b, a, 16-c)</tt>. You should always call the function as
 ///   if on a big endian machine as shown below.
 /// <pre>
 ///    uint8x16_p r0 = {0};
 ///    uint8x16_p r1 = VectorLoad(ptr);
 ///    uint8x16_p r5 = VectorShiftLeft<12>(r0, r1);
 /// </pre>
 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
 ///   endian sensitive?</A> on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T1, class T2>
 inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
 {
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    return (T1)vec_sld((uint8x16_p)vec2, (uint8x16_p)vec1, 16-C);
    const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
 #else
-    const uint8x16_p t1 = (uint8x16_p)data;
+    return (T1)vec_sld((uint8x16_p)vec1, (uint8x16_p)vec2, C);
 #endif
    if (IsAlignedOn(dest, 16))
    {
        vec_st(t1, 0,  dest);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));
        vec_ste((uint8x16_p) t2,  0, (unsigned char*) dest);
        vec_ste((uint16x8_p) t2,  1, (unsigned short*)dest);
        vec_ste((uint32x4_p) t2,  3, (unsigned int*)  dest);
        vec_ste((uint32x4_p) t2,  4, (unsigned int*)  dest);
        vec_ste((uint32x4_p) t2,  8, (unsigned int*)  dest);
        vec_ste((uint32x4_p) t2, 12, (unsigned int*)  dest);
        vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
        vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
    }
 }
-inline uint32x4_p VectorXor(const uint32x4_p vec1, const uint32x4_p vec2)
+#endif  // POWER4 and above
 {
    return vec_xor(vec1, vec2);
 }
 inline uint32x4_p VectorAdd(const uint32x4_p vec1, const uint32x4_p vec2)
 {
    return vec_add(vec1, vec2);
 }
 #endif
 // POWER7/POWER4 load and store
 #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 /// \brief Reverse a 16-byte array
@ -124,19 +172,6 @@ inline void ReverseByteArrayLE(byte src[16])
 #endif
 }
 /// \brief Reverse a vector
 /// \tparam T vector type
 /// \param src the vector
 /// \details Reverse() endian swaps the bytes in a vector
 /// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
 /// \since Crypto++ 6.0
 template <class T>
 inline T Reverse(const T& src)
 {
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return vec_perm(src, src, mask);
 }
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \details Loads a vector in big endian format from a byte array.
@ -346,86 +381,65 @@ inline void VectorStore(const T& src, int off, byte dest[16])
 #endif
 }
-/// \brief Permutes two vectors
+#else  // not CRYPTOPP_POWER7_AVAILABLE
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \param mask vector mask
 /// \details VectorPermute returns a new vector from vec1 and vec2
 ///   based on mask. mask is an uint8x16_p type vector. The return
 ///   vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
 inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
 {
    return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
 }
-/// \brief XOR two vectors
+// POWER7 is not available. Slow Altivec loads and stores.
-/// \tparam T1 vector type
+inline uint32x4_p VectorLoad(const byte src[16])
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorXor returns a new vector from vec1 and vec2. The return
 ///   vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
 inline T1 VectorXor(const T1& vec1, const T2& vec2)
 {
-    return (T1)vec_xor(vec1, (T1)vec2);
+    uint8x16_p data;
-}
+    if (IsAlignedOn(src, 16))
    {
        data = vec_ld(0, src);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p perm = vec_lvsl(0, src);
        const uint8x16_p low = vec_ld(0, src);
        const uint8x16_p high = vec_ld(15, src);
        data = vec_perm(low, high, perm);
    }
-/// \brief Add two vector
+#if defined(CRYPTOPP_BIG_ENDIAN)
-/// \tparam T1 vector type
+    return (uint32x4_p)data;
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorAdd returns a new vector from vec1 and vec2.
 ///   vec2 is cast to the same type as vec1. The return vector
 ///   is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
 inline T1 VectorAdd(const T1& vec1, const T2& vec2)
 {
    return (T1)vec_add(vec1, (T1)vec2);
 }
 /// \brief Shift two vectors left
 /// \tparam C shift byte count
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorShiftLeft() concatenates vec1 and vec2 and returns a
 ///   new vector after shifting the concatenation by the specified number
 ///   of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
 ///   vector is the same type as vec1.
 /// \details On big endian machines VectorShiftLeft() is <tt>vec_sld(a, b,
 ///   c)</tt>. On little endian machines VectorShiftLeft() is translated to
 ///   <tt>vec_sld(b, a, 16-c)</tt>. You should always call the function as
 ///   if on a big endian machine as shown below.
 /// <pre>
 ///    uint8x16_p r0 = {0};
 ///    uint8x16_p r1 = VectorLoad(ptr);
 ///    uint8x16_p r5 = VectorShiftLeft<12>(r0, r1);
 /// </pre>
 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
 ///   endian sensitive?</A> on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T1, class T2>
 inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
 {
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
    return (T1)vec_sld((uint8x16_p)vec2, (uint8x16_p)vec1, 16-C);
 #else
-    return (T1)vec_sld((uint8x16_p)vec1, (uint8x16_p)vec2, C);
+    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (uint32x4_p)vec_perm(data, data, mask);
 #endif
 }
-#endif  // CRYPTOPP_POWER7_AVAILABLE
+inline void VectorStore(const uint32x4_p data, byte dest[16])
 {
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
 #else
    const uint8x16_p t1 = (uint8x16_p)data;
 #endif
-#if defined(CRYPTOPP_POWER8_AES_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
+    if (IsAlignedOn(dest, 16))
    {
        vec_st(t1, 0,  dest);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));
        vec_ste((uint8x16_p) t2,  0, (unsigned char*) dest);
        vec_ste((uint16x8_p) t2,  1, (unsigned short*)dest);
        vec_ste((uint32x4_p) t2,  3, (unsigned int*)  dest);
        vec_ste((uint32x4_p) t2,  4, (unsigned int*)  dest);
        vec_ste((uint32x4_p) t2,  8, (unsigned int*)  dest);
        vec_ste((uint32x4_p) t2, 12, (unsigned int*)  dest);
        vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
        vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
    }
 }
 #endif  // POWER4/POWER7 load and store
 // POWER8 crypto
 #if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 /// \brief One round of AES encryption
 /// \tparam T1 vector type
@ -507,9 +521,9 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
 #endif
 }
-#endif  // CRYPTOPP_POWER8_AES_AVAILABLE
+#endif  // POWER8 crypto
-#if defined(CRYPTOPP_POWER8_SHA_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
+#if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 /// \brief SHA256 Sigma functions
 /// \tparam func function
@ -551,7 +565,7 @@ inline T VectorSHA512(const T& vec)
 #endif
 }
-#endif  // CRYPTOPP_POWER8_SHA_AVAILABLE
+#endif  // POWER8 crypto
 NAMESPACE_END
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@ -25,13 +25,6 @@
 #include "misc.h"
 #include "adv-simd.h"
 // We set CRYPTOPP_POWER8_CRYPTO_AVAILABLE based on compiler version.
 // If the crypto is not available, then we have to disable it here.
 #if !(defined(__CRYPTO) || defined(_ARCH_PWR8) || defined(_ARCH_PWR9))
 # undef CRYPTOPP_POWER8_CRYPTO_AVAILABLE
 # undef CRYPTOPP_POWER8_AES_AVAILABLE
 #endif
 #if (CRYPTOPP_AESNI_AVAILABLE)
 # include <smmintrin.h>
 # include <wmmintrin.h>
@ -68,6 +61,8 @@ extern const char RIJNDAEL_SIMD_FNAME[] = __FILE__;
 NAMESPACE_BEGIN(CryptoPP)
 // ************************* Feature Probes ************************* //
 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
 extern "C" {
    typedef void (*SigHandler)(int);
@ -142,6 +137,155 @@ bool CPU_ProbeAES()
 }
 #endif  // ARM32 or ARM64
 #if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
 // Runtime probe for POWER7. Executes an unaligned VSX load/store pair with
 // a temporary SIGILL handler installed and reports whether the 16 bytes
 // round-tripped. Returns false when probes are disabled, when POWER7/POWER8
 // support is compiled out, when the handler or signal mask cannot be
 // obtained, or when control comes back through setjmp(s_jmpSIGILL)
 // (presumably via a longjmp in SigIllHandler - confirm against its definition).
 bool CPU_ProbePower7()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER7_AVAILABLE) || (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        // Do not leave the probe's SIGILL handler installed on failure
        signal(SIGILL, oldHandler);
        return false;
    }
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        // POWER7 added unaligned loads and store operations
        byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17];
        // Specifically call the VSX loads and stores
        #if defined(__xlc__) || defined(__xlC__)
        vec_xst(vec_xl(0, b1+3), 0, b2+1);
        #else
        vec_vsx_st(vec_vsx_ld(0, b1+3), 0, b2+1);
        #endif
        result = (0 == std::memcmp(b1+3, b2+1, 16));
    }
    // Restore the signal mask and the previous SIGILL handler
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # else
    // No probe implementation for this configuration; every path must return
    return false;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER7_AVAILABLE
 }
 // Runtime probe for POWER8. Performs a 64-bit vector add through VSX loads
 // and stores with a temporary SIGILL handler installed and checks the
 // wrapped sums. Returns false when probes are disabled, when POWER8 support
 // is compiled out, when the handler or signal mask cannot be obtained, or
 // when control comes back through setjmp(s_jmpSIGILL).
 bool CPU_ProbePower8()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = true;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        // Do not leave the probe's SIGILL handler installed on failure
        signal(SIGILL, oldHandler);
        return false;
    }
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        // POWER8 added 64-bit SIMD operations
        const word64 x = W64LIT(0xffffffffffffffff);
        word64 w1[2] = {x, x}, w2[2] = {4, 6}, w3[2];
        // Specifically call the VSX loads and stores
        #if defined(__xlc__) || defined(__xlC__)
        const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1);
        const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2);
        const uint64x2_p v3 = vec_add(v1, v2);  // 64-bit add
        vec_xst((uint8x16_p)v3, 0, (byte*)w3);
        #else
        const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1);
        const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2);
        const uint64x2_p v3 = vec_add(v1, v2);  // 64-bit add
        vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3);
        #endif
        // Relies on integer wrap
        result = (w3[0] == 3 && w3[1] == 5);
    }
    // Restore the signal mask and the previous SIGILL handler
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # else
    // No probe implementation for this configuration; every path must return
    return false;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 // Runtime probe for the POWER8 AES operations. Runs VectorEncrypt,
 // VectorEncryptLast, VectorDecrypt and VectorDecryptLast on a fixed
 // state/key pair with a temporary SIGILL handler installed, and reports
 // true when the stored 16-byte result is not all zeros. Returns false when
 // probes are disabled, when POWER8 AES support is compiled out, when the
 // handler or signal mask cannot be obtained, or when control comes back
 // through setjmp(s_jmpSIGILL).
 bool CPU_ProbeAES()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AES_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = true;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        // Do not leave the probe's SIGILL handler installed on failure
        signal(SIGILL, oldHandler);
        return false;
    }
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte key[16] = {0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1,
                        0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05};
        byte state[16] = {0x19, 0x3d, 0xe3, 0xb3, 0xa0, 0xf4, 0xe2, 0x2b,
                          0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08};
        // r starts non-zero and z is all zeros; the check below compares them
        byte r[16] = {255}, z[16] = {};
        uint8x16_p k = (uint8x16_p)VectorLoad(0, key);
        uint8x16_p s = (uint8x16_p)VectorLoad(0, state);
        s = VectorEncrypt(s, k);
        s = VectorEncryptLast(s, k);
        s = VectorDecrypt(s, k);
        s = VectorDecryptLast(s, k);
        VectorStore(s, r);
        result = (0 != std::memcmp(r, z, 16));
    }
    // Restore the signal mask and the previous SIGILL handler
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # else
    // No probe implementation for this configuration; every path must return
    return false;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AES_AVAILABLE
 }
 #endif  // PPC32 or PPC64
 // ***************************** ARMv8 ***************************** //
 #if (CRYPTOPP_ARM_AES_AVAILABLE)
--- a/sha-simd.cpp
+++ b/sha-simd.cpp
@ -185,6 +185,96 @@ bool CPU_ProbeSHA2()
 }
 #endif  // ARM32 or ARM64
 #if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
 // Runtime probe for the POWER8 SHA-256 operations. Applies the four
 // VectorSHA256<,> variants to an all-zero vector with a temporary SIGILL
 // handler installed and reports true when the stored 16 bytes equal 16
 // zero bytes. Returns false when probes are disabled, when POWER8 support
 // is compiled out, when the handler or signal mask cannot be obtained, or
 // when control comes back through setjmp(s_jmpSIGILL) (presumably via a
 // longjmp in SigIllHandler - confirm against its definition).
 bool CPU_ProbeSHA256()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        // Do not leave the probe's SIGILL handler installed on failure
        signal(SIGILL, oldHandler);
        return false;
    }
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte r[16], z[16] = {0};
        uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
        x = VectorSHA256<0,0>(x);
        x = VectorSHA256<0,1>(x);
        x = VectorSHA256<1,0>(x);
        x = VectorSHA256<1,1>(x);
        VectorStore(x, r);
        result = (0 == std::memcmp(r, z, 16));
    }
    // Restore the signal mask and the previous SIGILL handler
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # else
    // No probe implementation for this configuration; every path must return
    return false;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 // Runtime probe for the POWER8 SHA-512 operations. Applies the four
 // VectorSHA512<,> variants to an all-zero vector with a temporary SIGILL
 // handler installed and reports true when the stored 16 bytes equal 16
 // zero bytes. Returns false when probes are disabled, when POWER8 support
 // is compiled out, when the handler or signal mask cannot be obtained, or
 // when control comes back through setjmp(s_jmpSIGILL) (presumably via a
 // longjmp in SigIllHandler - confirm against its definition).
 bool CPU_ProbeSHA512()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
 #elif (CRYPTOPP_POWER8_AVAILABLE)
 # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;
    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;
    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        // Do not leave the probe's SIGILL handler installed on failure
        signal(SIGILL, oldHandler);
        return false;
    }
    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte r[16], z[16] = {0};
        uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
        x = VectorSHA512<0,0>(x);
        x = VectorSHA512<0,1>(x);
        x = VectorSHA512<1,0>(x);
        x = VectorSHA512<1,1>(x);
        VectorStore(x, r);
        result = (0 == std::memcmp(r, z, 16));
    }
    // Restore the signal mask and the previous SIGILL handler
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
 # else
    // No probe implementation for this configuration; every path must return
    return false;
 # endif
 #else
    return false;
 #endif  // CRYPTOPP_POWER8_AVAILABLE
 }
 #endif  // PPC32 or PPC64
 // ***************** Intel x86 SHA ********************
 // provided by sha.cpp, 16-byte aligned