Make Altivec vector wraps friendly to downgrades

The way the existing ppc_simd.h is written makes it hard to switch between the old Altivec loads and stores and the new POWER7 loads and stores. This checkin rewrites the wrappers to use __ALTIVEC__, _ARCH_PWR7 and _ARCH_PWR8. The wrappers in this file now honor -maltivec, -mcpu=power7 and -mcpu=power8. It allows users to compile a source file, like chacha_simd.cpp, with a lower ISA and things just work for them.
pull/748/head
Jeffrey Walton 2018-11-15 02:11:00 -05:00
parent 3c7bdf1a26
commit 10f85d6596
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 468 additions and 444 deletions

View File

@ -24,7 +24,7 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
@ -85,6 +85,11 @@ std::string ChaCha_Policy::AlgorithmProvider() const
if (HasPower7())
return "Power7";
else
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return "Altivec";
else
#endif
return "C++";
}
@ -139,8 +144,8 @@ unsigned int ChaCha_Policy::GetAlignment() const
return 16;
else
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return 16;
else
#endif
@ -164,8 +169,8 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
return 4*BYTES_PER_ITERATION;
else
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return 4*BYTES_PER_ITERATION;
else
#endif
@ -245,8 +250,8 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
}
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
{
while (iterationCount >= 4 && MultiBlockSafe(4))
{

View File

@ -2,7 +2,7 @@
// Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSE2, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// SSE2, ARM NEON and ARMv8a, Power7 and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instructions sets in some build configurations.
//
@ -54,7 +54,7 @@
# include <arm_acle.h>
#endif
#if defined(CRYPTOPP_POWER7_AVAILABLE)
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif
@ -201,25 +201,24 @@ inline __m128i RotateLeft<16>(const __m128i val)
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
// **************************** POWER7 **************************** //
// **************************** Altivec **************************** //
#if (CRYPTOPP_POWER7_AVAILABLE)
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
// POWER8 is optional and runs about 0.6 cpb faster because
// of the native 64-bit vector add. That's about 700 MB/s on
// GCC112 from the compile farm. Use -mcpu=power8 to engage
// POWER8. POWER7 lacks 64-bit element support, so code built
// with -mcpu=power8 will SIGILL on POWER7 machines.
// ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec
// is supported by using vec_ld and vec_st, and using a composite vec_add
// that supports 64-bit element adds. vec_ld and vec_st add significant
// overhead when memory is not aligned. Despite the drawbacks Altivec
// is profitable. The numbers for ChaCha8 are:
//
// PowerMac, C++, 2.0 GHz: 205 MB/s, 9.29 cpb
// PowerMac, Altivec, 2.0 GHz: 471 MB/s, 4.09 cpb
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VectorLoad;
using CryptoPP::VectorStore;
#if (_ARCH_PWR8 || _ARCH_PWR9)
using CryptoPP::uint64x2_p;
#endif
// Permutes bytes in packed 32-bit words to little endian.
// State is already in proper endian order. Input and
// output must be permuted during load and save.
@ -241,28 +240,12 @@ inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
VectorStore(dest, vec_perm(val, val, mask));
VectorStore(vec_perm(val, val, mask), dest);
#else
return VectorStore(dest, val);
return VectorStore(val, dest);
#endif
}
// Rotate packed 32-bit words left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft(const uint32x4_p val)
{
const uint32x4_p m = {C, C, C, C};
return vec_rl(val, m);
}
// Rotate packed 32-bit words right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight(const uint32x4_p val)
{
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
return vec_rl(val, m);
}
// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
// rotation on the 128-bit vector word:
// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x)
@ -296,25 +279,7 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
return vec_perm(val, val, mask);
}
// Helper to perform 64-bit addition across two elements of 32-bit vectors
inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
{
#if (_ARCH_PWR8 || _ARCH_PWR9)
return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
#else
// The carry mask selects carries from elements 1 and 3 and sets remaining
// elements to 0. The mask also shifts the carried values left by 4 bytes
// so the carries are added to elements 0 and 2.
const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
const uint32x4_p zero = {0, 0, 0, 0};
uint32x4_p cy = vec_addc(a, b);
cy = vec_perm(cy, zero, cmask);
return vec_add(vec_add(a, b), cy);
#endif
}
#endif // CRYPTOPP_POWER7_AVAILABLE
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
ANONYMOUS_NAMESPACE_END
@ -856,7 +821,7 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
#if (CRYPTOPP_POWER7_AVAILABLE)
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
@ -901,10 +866,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<16>(r0_3);
r1_3 = RotateLeft<16>(r1_3);
r2_3 = RotateLeft<16>(r2_3);
r3_3 = RotateLeft<16>(r3_3);
r0_3 = VectorRotateLeft<16>(r0_3);
r1_3 = VectorRotateLeft<16>(r1_3);
r2_3 = VectorRotateLeft<16>(r2_3);
r3_3 = VectorRotateLeft<16>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
@ -916,10 +881,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<12>(r0_1);
r1_1 = RotateLeft<12>(r1_1);
r2_1 = RotateLeft<12>(r2_1);
r3_1 = RotateLeft<12>(r3_1);
r0_1 = VectorRotateLeft<12>(r0_1);
r1_1 = VectorRotateLeft<12>(r1_1);
r2_1 = VectorRotateLeft<12>(r2_1);
r3_1 = VectorRotateLeft<12>(r3_1);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
@ -931,10 +896,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<8>(r0_3);
r1_3 = RotateLeft<8>(r1_3);
r2_3 = RotateLeft<8>(r2_3);
r3_3 = RotateLeft<8>(r3_3);
r0_3 = VectorRotateLeft<8>(r0_3);
r1_3 = VectorRotateLeft<8>(r1_3);
r2_3 = VectorRotateLeft<8>(r2_3);
r3_3 = VectorRotateLeft<8>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
@ -946,10 +911,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<7>(r0_1);
r1_1 = RotateLeft<7>(r1_1);
r2_1 = RotateLeft<7>(r2_1);
r3_1 = RotateLeft<7>(r3_1);
r0_1 = VectorRotateLeft<7>(r0_1);
r1_1 = VectorRotateLeft<7>(r1_1);
r2_1 = VectorRotateLeft<7>(r2_1);
r3_1 = VectorRotateLeft<7>(r3_1);
r0_1 = Shuffle<1>(r0_1);
r0_2 = Shuffle<2>(r0_2);
@ -977,10 +942,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<16>(r0_3);
r1_3 = RotateLeft<16>(r1_3);
r2_3 = RotateLeft<16>(r2_3);
r3_3 = RotateLeft<16>(r3_3);
r0_3 = VectorRotateLeft<16>(r0_3);
r1_3 = VectorRotateLeft<16>(r1_3);
r2_3 = VectorRotateLeft<16>(r2_3);
r3_3 = VectorRotateLeft<16>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
@ -992,10 +957,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<12>(r0_1);
r1_1 = RotateLeft<12>(r1_1);
r2_1 = RotateLeft<12>(r2_1);
r3_1 = RotateLeft<12>(r3_1);
r0_1 = VectorRotateLeft<12>(r0_1);
r1_1 = VectorRotateLeft<12>(r1_1);
r2_1 = VectorRotateLeft<12>(r2_1);
r3_1 = VectorRotateLeft<12>(r3_1);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
@ -1007,10 +972,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<8>(r0_3);
r1_3 = RotateLeft<8>(r1_3);
r2_3 = RotateLeft<8>(r2_3);
r3_3 = RotateLeft<8>(r3_3);
r0_3 = VectorRotateLeft<8>(r0_3);
r1_3 = VectorRotateLeft<8>(r1_3);
r2_3 = VectorRotateLeft<8>(r2_3);
r3_3 = VectorRotateLeft<8>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
@ -1022,10 +987,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<7>(r0_1);
r1_1 = RotateLeft<7>(r1_1);
r2_1 = RotateLeft<7>(r2_1);
r3_1 = RotateLeft<7>(r3_1);
r0_1 = VectorRotateLeft<7>(r0_1);
r1_1 = VectorRotateLeft<7>(r1_1);
r2_1 = VectorRotateLeft<7>(r2_1);
r3_1 = VectorRotateLeft<7>(r3_1);
r0_1 = Shuffle<3>(r0_1);
r0_2 = Shuffle<2>(r0_2);
@ -1120,6 +1085,6 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
VectorStore32LE(output + 15*16, r3_3);
}
#endif // CRYPTOPP_POWER7_AVAILABLE
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
NAMESPACE_END

View File

@ -13,19 +13,15 @@
/// provide best support and don't need many of the little hacks below.
/// \since Crypto++ 6.0
// Use __ALTIVEC__, _ARCH_PWR7 and _ARCH_PWR8. The preprocessor macros
// depend on compiler options like -maltivec (and not compiler versions).
#ifndef CRYPTOPP_PPC_CRYPTO_H
#define CRYPTOPP_PPC_CRYPTO_H
#include "config.h"
#include "misc.h"
// We are boxed into undefining macros like CRYPTOPP_POWER8_AVAILABLE.
// We set CRYPTOPP_POWER8_AVAILABLE based on compiler versions because
// we needed them for the SIMD and non-SIMD files. When the SIMD file is
// compiled it may only get -mcpu=power4 or -mcpu=power7, so the POWER7
// or POWER8 stuff is not actually available when this header is included.
// We also need to handle the case of -DCRYPTOPP_ALTIVEC_AVAILABLE=0.
#if defined(__ALTIVEC__)
# include <altivec.h>
# undef vector
@ -33,6 +29,13 @@
# undef bool
#endif
// VectorLoad_ALTIVEC and VectorStore_ALTIVEC are too noisy on modern compilers
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated"
#endif
NAMESPACE_BEGIN(CryptoPP)
// Wrap everything in this file based on CRYPTOPP_ALTIVEC_AVAILABLE
@ -42,9 +45,10 @@ NAMESPACE_BEGIN(CryptoPP)
typedef __vector unsigned char uint8x16_p;
typedef __vector unsigned short uint16x8_p;
typedef __vector unsigned int uint32x4_p;
#if defined(_ARCH_PWR8)
typedef __vector unsigned long long uint64x2_p;
#endif // POWER8 datatypes
#endif // _ARCH_PWR8
/// \brief Reverse a vector
/// \tparam T vector type
@ -53,12 +57,323 @@ typedef __vector unsigned long long uint64x2_p;
/// \details Reverse() endian swaps the bytes in a vector
/// \since Crypto++ 6.0
template <class T>
inline T Reverse(const T& src)
inline T Reverse(const T src)
{
const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
return (T)vec_perm(src, src, mask);
}
//////////////////////// Loads ////////////////////////
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array
///   using only Altivec era instructions.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16])
{
    if (!IsAlignedOn(src, 16))
    {
        // Unaligned load sequence per the AltiVec PEM:
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p shift = vec_lvsl(0, src);
        const uint8x16_p lo = vec_ld(0, src);
        const uint8x16_p hi = vec_ld(15, src);
        return (uint32x4_p)vec_perm(lo, hi, shift);
    }
    return (uint32x4_p)vec_ld(0, src);
}
/// \brief Loads a vector from a byte array
/// \param off offset into the src byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array
///   using only Altivec era instructions.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16])
{
    // NOTE(review): the aligned check tests src, not src+off; presumably
    // callers pass off as a multiple of 16 when src is aligned — confirm.
    if (IsAlignedOn(src, 16))
    {
        return (uint32x4_p)vec_ld(off, src);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        // The two loads must bracket src+off .. src+off+15, so the high
        // load carries the offset too. A fixed offset of 15 would fetch
        // the wrong quadword whenever off crosses a 16-byte boundary.
        const uint8x16_p perm = vec_lvsl(off, src);
        const uint8x16_p low = vec_ld(off, src);
        const uint8x16_p high = vec_ld(off+15, src);
        return (uint32x4_p)vec_perm(low, high, perm);
    }
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const byte src[16])
{
#if defined(_ARCH_PWR7)
// POWER7 provides unaligned VSX loads. XL C/C++ and Clang spell the
// built-in vec_xl; GCC spells it vec_vsx_ld.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl(0, (byte*)src);
# else
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# endif
#else
// Pre-POWER7 hardware: fall back to the Altivec-only load sequence.
return VectorLoad_ALTIVEC(src);
#endif
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const byte src[16])
{
#if defined(_ARCH_PWR7)
// POWER7 provides unaligned VSX loads. XL C/C++ and Clang spell the
// built-in vec_xl; GCC spells it vec_vsx_ld.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl(off, (byte*)src);
# else
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# endif
#else
// Pre-POWER7 hardware: fall back to the Altivec-only load sequence.
return VectorLoad_ALTIVEC(off, src);
#endif
}
/// \brief Loads a vector from a word array
/// \param src the word array
/// \details Loads a vector in native endian format from a word array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const word32 src[4])
{
return VectorLoad((const byte*)src);
}
/// \brief Loads a vector from a word array
/// \param src the word array
/// \param off byte offset into the word array
/// \details Loads a vector in native endian format from a word array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const word32 src[4])
{
return VectorLoad(off, (const byte*)src);
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in big endian format from a byte array.
/// VectorLoadBE will swap all bytes on little endian systems.
/// \note VectorLoadBE() does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(const byte src[16])
{
#if defined(_ARCH_PWR7)
// vec_xl_be (XL C/C++, Clang) loads big endian regardless of host order.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl_be(0, (byte*)src);
# else
// GCC: unaligned VSX load, plus a byte reverse on little endian.
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# else
return (uint32x4_p)Reverse(vec_vsx_ld(0, (byte*)src));
# endif
# endif
#else // _ARCH_PWR7
// Altivec fallback: native endian load, plus a byte reverse on little endian.
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)VectorLoad((const byte*)src);
# else
return (uint32x4_p)Reverse(VectorLoad((const byte*)src));
# endif
#endif // _ARCH_PWR7
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in big endian format from a byte array.
/// VectorLoadBE will swap all bytes on little endian systems.
/// \note VectorLoadBE does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(int off, const byte src[16])
{
#if defined(_ARCH_PWR7)
// vec_xl_be (XL C/C++, Clang) loads big endian regardless of host order.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl_be(off, (byte*)src);
# else
// GCC: unaligned VSX load, plus a byte reverse on little endian.
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# else
return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src));
# endif
# endif
#else // _ARCH_PWR7
// Altivec fallback: native endian load, plus a byte reverse on little endian.
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)VectorLoad(off, (const byte*)src);
# else
return (uint32x4_p)Reverse(VectorLoad(off, (const byte*)src));
# endif
#endif // _ARCH_PWR7
}
//////////////////////// Stores ////////////////////////
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details Stores a vector to a byte array using only Altivec era
///   instructions (vec_st, vec_lvsr, vec_ste).
/// \note The destination array does not need to be 16-byte aligned.
template<class T>
inline void VectorStore_ALTIVEC(const T data, byte dest[16])
{
if (IsAlignedOn(dest, 16))
{
vec_st((uint8x16_p)data, 0, dest);
}
else
{
// http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
// vec_ste stores a single element at a truncated effective address.
// The byte/halfword/word stores at offsets 0,1,3,4,8,12,14,15 are
// chosen so the full 16 bytes are covered for any dest alignment.
uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
}
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off byte offset into the dest array
/// \param dest the byte array
/// \details Stores a vector to a byte array using only Altivec era
///   instructions (vec_st, vec_lvsr, vec_ste).
/// \note The destination array does not need to be 16-byte aligned.
template<class T>
inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16])
{
    if (IsAlignedOn(dest, 16))
    {
        vec_st((uint8x16_p)data, off, dest);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        // The permute is computed relative to dest+off, so each element
        // store must carry the offset as well. Fixed offsets 0..15 would
        // write dest+0..15 instead of dest+off..dest+off+15 when off != 0.
        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(off, dest));
        vec_ste((uint8x16_p) perm, off+0, (unsigned char*) dest);
        vec_ste((uint16x8_p) perm, off+1, (unsigned short*)dest);
        vec_ste((uint32x4_p) perm, off+3, (unsigned int*) dest);
        vec_ste((uint32x4_p) perm, off+4, (unsigned int*) dest);
        vec_ste((uint32x4_p) perm, off+8, (unsigned int*) dest);
        vec_ste((uint32x4_p) perm, off+12, (unsigned int*) dest);
        vec_ste((uint16x8_p) perm, off+14, (unsigned short*)dest);
        vec_ste((uint8x16_p) perm, off+15, (unsigned char*) dest);
    }
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \note VectorStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T data, byte dest[16])
{
#if defined(_ARCH_PWR7)
// POWER7 provides unaligned VSX stores. XL C/C++ and Clang spell the
// built-in vec_xst; GCC spells it vec_vsx_st.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)data, 0, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
# endif
#else
// Pre-POWER7 hardware: forward to the Altivec-only store with offset 0.
return VectorStore_ALTIVEC(data, 0, dest);
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off the byte offset into the array
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \note VectorStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T data, int off, byte dest[16])
{
#if defined(_ARCH_PWR7)
// POWER7 provides unaligned VSX stores. XL C/C++ and Clang spell the
// built-in vec_xst; GCC spells it vec_vsx_st.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)data, off, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
# endif
#else
// Pre-POWER7 hardware: forward to the Altivec-only store.
return VectorStore_ALTIVEC(data, off, dest);
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
/// VectorStoreBE will swap all bytes on little endian systems.
/// \note VectorStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T src, byte dest[16])
{
#if defined(_ARCH_PWR7)
// vec_xst_be (XL C/C++, Clang) stores big endian regardless of host order.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst_be((uint8x16_p)src, 0, (byte*)dest);
# else
// GCC: byte reverse on little endian, then an unaligned VSX store.
# if (CRYPTOPP_BIG_ENDIAN)
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest);
# endif
# endif
#else // _ARCH_PWR7
// Altivec fallback: byte reverse on little endian, then a native store.
# if (CRYPTOPP_BIG_ENDIAN)
VectorStore((uint8x16_p)src, (byte*)dest);
# else
VectorStore((uint8x16_p)Reverse(src), (byte*)dest);
# endif
#endif // _ARCH_PWR7
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
/// VectorStoreBE will swap all bytes on little endian systems.
/// \note VectorStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T src, int off, byte dest[16])
{
#if defined(_ARCH_PWR7)
// vec_xst_be (XL C/C++, Clang) stores big endian regardless of host order.
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst_be((uint8x16_p)src, off, (byte*)dest);
# else
// GCC: byte reverse on little endian, then an unaligned VSX store.
# if (CRYPTOPP_BIG_ENDIAN)
vec_vsx_st((uint8x16_p)src, off, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest);
# endif
# endif
#else // _ARCH_PWR7
// Altivec fallback: byte reverse on little endian, then a native store.
# if (CRYPTOPP_BIG_ENDIAN)
VectorStore((uint8x16_p)src, off, (byte*)dest);
# else
VectorStore((uint8x16_p)Reverse(src), off, (byte*)dest);
# endif
#endif // _ARCH_PWR7
}
//////////////////////// Miscellaneous ////////////////////////
/// \brief Permutes a vector
/// \tparam T vector type
/// \param vec the vector
@ -69,7 +384,7 @@ inline T Reverse(const T& src)
/// vector is the same type as vec.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorPermute(const T1& vec, const T2& mask)
inline T1 VectorPermute(const T1 vec, const T2 mask)
{
return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
}
@ -86,7 +401,7 @@ inline T1 VectorPermute(const T1& vec, const T2& mask)
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask)
{
return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
}
@ -101,7 +416,7 @@ inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorAnd(const T1 vec1, const T2 vec2)
{
    // Normalize the second operand to T1 before the bitwise AND.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
@ -116,7 +431,7 @@ inline T1 VectorAnd(const T1& vec1, const T2& vec2)
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorOr(const T1 vec1, const T2 vec2)
{
    // Normalize the second operand to T1 before the bitwise OR.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
@ -131,7 +446,7 @@ inline T1 VectorOr(const T1& vec1, const T2& vec2)
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorXor(const T1 vec1, const T2 vec2)
{
    // Normalize the second operand to T1 before the bitwise XOR.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
@ -147,7 +462,7 @@ inline T1 VectorXor(const T1& vec1, const T2& vec2)
/// is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorAdd(const T1 vec1, const T2 vec2)
{
    // Normalize the second operand to T1, then add element-wise.
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
@ -162,11 +477,37 @@ inline T1 VectorAdd(const T1& vec1, const T2& vec2)
/// is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorSub(const T1 vec1, const T2 vec2)
{
    // Normalize the second operand to T1, then subtract element-wise.
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
/// \brief Add two vectors
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VectorAdd64 returns a new vector from vec1 and vec2.
///   vec1 and vec2 are added as if they held uint64x2_p quantities.
///   POWER8 uses the native 64-bit vector add; older hardware
///   synthesizes the 64-bit add from 32-bit adds and carries.
/// \since Crypto++ 8.0
inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
#if defined(_ARCH_PWR8)
    return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
#else
    // vec_addc produces the 32-bit carries. The permute selects the
    // carries from elements 1 and 3, shifts them left by 4 bytes so
    // they line up with elements 0 and 2, and zeroes the remaining
    // elements (index 16 selects from the zero vector).
    const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
    const uint32x4_p zero = {0, 0, 0, 0};
    uint32x4_p carry = vec_addc(vec1, vec2);
    carry = vec_perm(carry, zero, cmask);
    return vec_add(vec_add(vec1, vec2), carry);
#endif
}
/// \brief Shift a vector left
/// \tparam C shift byte count
/// \tparam T vector type
@ -180,14 +521,14 @@ inline T1 VectorSub(const T1& vec1, const T2& vec2)
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p r1 = VectorLoad(ptr);
/// uint8x16_p r5 = VectorShiftLeftOctet<12>(r1);
/// uint8x16_p x = VectorLoad(ptr);
/// uint8x16_p y = VectorShiftLeftOctet<12>(x);
/// </pre>
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorShiftLeftOctet(const T& vec)
inline T VectorShiftLeftOctet(const T vec)
{
const T zero = {0};
if (C >= 16)
@ -223,14 +564,14 @@ inline T VectorShiftLeftOctet(const T& vec)
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p r1 = VectorLoad(ptr);
/// uint8x16_p r5 = VectorShiftRightOctet<12>(r1);
/// uint8x16_p x = VectorLoad(ptr);
/// uint8x16_p y = VectorShiftRightOctet<12>(y);
/// </pre>
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorShiftRightOctet(const T& vec)
inline T VectorShiftRightOctet(const T vec)
{
const T zero = {0};
if (C >= 16)
@ -265,7 +606,7 @@ inline T VectorShiftRightOctet(const T& vec)
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorRotateLeftOctet(const T& vec)
inline T VectorRotateLeftOctet(const T vec)
{
enum { R = C&0xf };
#if (CRYPTOPP_BIG_ENDIAN)
@ -287,7 +628,7 @@ inline T VectorRotateLeftOctet(const T& vec)
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorRotateRightOctet(const T& vec)
inline T VectorRotateRightOctet(const T vec)
{
enum { R = C&0xf };
#if (CRYPTOPP_BIG_ENDIAN)
@ -297,13 +638,37 @@ inline T VectorRotateRightOctet(const T& vec)
#endif
}
/// \brief Rotate a vector left
/// \tparam C rotate bit count
/// \param vec the vector
/// \returns vector
/// \details VectorRotateLeft rotates each 32-bit element of the
///   packed vector left by C bits.
template<unsigned int C>
inline uint32x4_p VectorRotateLeft(const uint32x4_p vec)
{
    // vec_rl takes a per-element rotate count; broadcast C to all lanes.
    const uint32x4_p counts = {C, C, C, C};
    return vec_rl(vec, counts);
}
/// \brief Rotate a vector right
/// \tparam C rotate bit count
/// \param vec the vector
/// \returns vector
/// \details VectorRotateRight rotates each 32-bit element of the
///   packed vector right by C bits.
template<unsigned int C>
inline uint32x4_p VectorRotateRight(const uint32x4_p vec)
{
    // A right rotate by C is a left rotate by 32-C; vec_rl only rotates left.
    const uint32x4_p counts = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(vec, counts);
}
/// \brief Exchange high and low double words
/// \tparam T vector type
/// \param vec the vector
/// \returns vector
/// \since Crypto++ 7.0
template <class T>
inline T VectorSwapWords(const T& vec)
inline T VectorSwapWords(const T vec)
{
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
}
@ -317,7 +682,7 @@ inline T VectorSwapWords(const T& vec)
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
template <class T>
inline T VectorGetLow(const T& val)
inline T VectorGetLow(const T val)
{
//const T zero = {0};
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
@ -334,7 +699,7 @@ inline T VectorGetLow(const T& val)
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
template <class T>
inline T VectorGetHigh(const T& val)
inline T VectorGetHigh(const T val)
{
//const T zero = {0};
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
@ -349,7 +714,7 @@ inline T VectorGetHigh(const T& val)
/// \param vec2 the second vector
/// \returns true if vec1 equals vec2, false otherwise
template <class T1, class T2>
inline bool VectorEqual(const T1& vec1, const T2& vec2)
inline bool VectorEqual(const T1 vec1, const T2 vec2)
{
return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
@ -361,324 +726,13 @@ inline bool VectorEqual(const T1& vec1, const T2& vec2)
/// \param vec2 the second vector
/// \returns true if vec1 does not equal vec2, false otherwise
template <class T1, class T2>
inline bool VectorNotEqual(const T1& vec1, const T2& vec2)
inline bool VectorNotEqual(const T1 vec1, const T2 vec2)
{
return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
// POWER7/POWER4 load and store
#if defined(_ARCH_PWR7)
//////////////////////// Power8 Crypto ////////////////////////
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in big endian format from a byte array.
/// VectorLoadBE will swap endianess on little endian systems.
/// \note VectorLoadBE() does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(const byte src[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl_be(0, (byte*)src);
#else
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)vec_vsx_ld(0, src);
# else
return (uint32x4_p)Reverse(vec_vsx_ld(0, src));
# endif
#endif
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in big endian format from a byte array.
/// VectorLoadBE will swap endianess on little endian systems.
/// \note VectorLoadBE does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(int off, const byte src[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl_be(off, (byte*)src);
#else
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# else
return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src));
# endif
#endif
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const byte src[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl(0, (byte*)src);
#else
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
#endif
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const byte src[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p)vec_xl(off, (byte*)src);
#else
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
#endif
}
/// \brief Loads a vector from a word array
/// \param src the word array
/// \details Loads a vector in native endian format from a word array
///   by delegating to the byte-array overload.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const word32 src[4])
{
    return VectorLoad(reinterpret_cast<const byte*>(src));
}
/// \brief Loads a vector from a word array
/// \param off offset into the src word array
/// \param src the word array
/// \details Loads a vector in native endian format from a word array
///   by delegating to the byte-array overload.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const word32 src[4])
{
    return VectorLoad(off, reinterpret_cast<const byte*>(src));
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
/// VectorStoreBE will swap endianess on little endian systems.
/// \note VectorStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T& src, byte dest[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst_be((uint8x16_p)src, 0, (byte*)dest);
#else
# if (CRYPTOPP_BIG_ENDIAN)
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest);
# endif
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
/// VectorStoreBE will swap endianess on little endian systems.
/// \note VectorStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T& src, int off, byte dest[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst_be((uint8x16_p)src, off, (byte*)dest);
#else
# if (CRYPTOPP_BIG_ENDIAN)
vec_vsx_st((uint8x16_p)src, off, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest);
# endif
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \note VectorStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T& src, byte dest[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)src, 0, (byte*)dest);
#else
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param dest the byte array
/// \param src the vector
/// \details Stores a vector in native endian format to a byte array.
///   This overload takes the destination first and the vector second;
///   it behaves identically to VectorStore(src, dest).
/// \note VectorStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(byte dest[16], const T& src)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)src, 0, (byte*)dest);
#else
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \note VectorStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T& src, int off, byte dest[16])
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)src, off, (byte*)dest);
#else
vec_vsx_st((uint8x16_p)src, off, (byte*)dest);
#endif
}
#else // ########## Not CRYPTOPP_POWER7_AVAILABLE ##########
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const byte src[16])
{
    if (!IsAlignedOn(src, 16))
    {
        // Unaligned: merge the two aligned chunks covering src[0..15].
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p shift = vec_lvsl(0, src);
        const uint8x16_p lo = vec_ld(0, src);
        const uint8x16_p hi = vec_ld(15, src);
        return (uint32x4_p)vec_perm(lo, hi, shift);
    }

    // Aligned: a single Altivec load suffices.
    return (uint32x4_p)vec_ld(0, src);
}
/// \brief Loads a vector from a byte array
/// \param off offset into the src byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const byte src[16])
{
    // The effective address is src+off, so alignment must be tested on
    // the sum; testing src alone mis-loads when off is not a multiple
    // of 16 (vec_ld truncates the effective address to 16 bytes).
    if (IsAlignedOn(src+off, 16))
    {
        return (uint32x4_p)vec_ld(off, src);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        // vec_ld rounds the effective address down, so the high chunk
        // must be fetched at off+15 (not 15) to cover the last byte of
        // the unaligned 16-byte region starting at src+off.
        const uint8x16_p perm = vec_lvsl(off, src);
        const uint8x16_p low = vec_ld(off, src);
        const uint8x16_p high = vec_ld(off+15, src);
        return (uint32x4_p)vec_perm(low, high, perm);
    }
}
/// \brief Loads a vector from a word array
/// \param src the word array
/// \details Loads a vector in native endian format from a word array
///   by delegating to the byte-array overload.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const word32 src[4])
{
    return VectorLoad(reinterpret_cast<const byte*>(src));
}
/// \brief Loads a vector from a word array
/// \param off offset into the src word array
/// \param src the word array
/// \details Loads a vector in native endian format from a word array
///   by delegating to the byte-array overload.
/// \note VectorLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const word32 src[4])
{
    return VectorLoad(off, reinterpret_cast<const byte*>(src));
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in big endian format from a byte array.
///   VectorLoadBE will swap endianess on little endian systems.
/// \note VectorLoadBE() does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(const byte src[16])
{
    const uint32x4_p val = VectorLoad(src);
#if (CRYPTOPP_BIG_ENDIAN)
    // Big endian target: the native load is already big endian.
    return val;
#else
    // Little endian target: reverse the bytes after the load.
    return (uint32x4_p)Reverse(val);
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array
///   using Altivec-only instructions.
/// \note VectorStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T& data, byte dest[16])
{
if (IsAlignedOn(dest, 16))
{
// Aligned destination: a single Altivec store suffices.
vec_st((uint8x16_p)data, 0, dest);
}
else
{
// Unaligned-store idiom from the AltiVec PEM:
// http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
// First rotate the data into misaligned position with vec_lvsr,
// then scatter it with single-element stores. vec_ste writes one
// element at the (element-aligned) effective address; the mix of
// char/short/int stores below covers all 16 bytes of dest without
// touching memory outside dest[0..15].
uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
}
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param src the vector
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
///   VectorStoreBE will swap endianess on little endian systems.
/// \note VectorStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T& src, byte dest[16])
{
#if (CRYPTOPP_BIG_ENDIAN)
    // Big endian target: store the vector as-is.
    const T val(src);
#else
    // Little endian target: reverse the bytes before the store.
    const T val(Reverse(src));
#endif
    VectorStore(val, dest);
}
#endif // POWER4/POWER7 load and store
// POWER8 crypto
#if defined(_ARCH_PWR8)
/// \brief One round of AES encryption
@ -690,7 +744,7 @@ inline void VectorStoreBE(const T& src, byte dest[16])
/// using subkey key. The return vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorEncrypt(const T1& state, const T2& key)
inline T1 VectorEncrypt(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
@ -710,7 +764,7 @@ inline T1 VectorEncrypt(const T1& state, const T2& key)
/// of state using subkey key. The return vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorEncryptLast(const T1& state, const T2& key)
inline T1 VectorEncryptLast(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
@ -730,7 +784,7 @@ inline T1 VectorEncryptLast(const T1& state, const T2& key)
/// using subkey key. The return vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorDecrypt(const T1& state, const T2& key)
inline T1 VectorDecrypt(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
@ -750,7 +804,7 @@ inline T1 VectorDecrypt(const T1& state, const T2& key)
/// of state using subkey key. The return vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorDecryptLast(const T1& state, const T2& key)
inline T1 VectorDecryptLast(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
@ -761,10 +815,6 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
#endif
}
#endif // POWER8 crypto
#if defined(_ARCH_PWR8)
/// \brief SHA256 Sigma functions
/// \tparam func function
/// \tparam subfunc sub-function
@ -774,7 +824,7 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
/// func and subfunc. The return vector is the same type as vec.
/// \since Crypto++ 6.0
template <int func, int subfunc, class T>
inline T VectorSHA256(const T& vec)
inline T VectorSHA256(const T vec)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc);
@ -794,7 +844,7 @@ inline T VectorSHA256(const T& vec)
/// func and subfunc. The return vector is the same type as vec.
/// \since Crypto++ 6.0
template <int func, int subfunc, class T>
inline T VectorSHA512(const T& vec)
inline T VectorSHA512(const T vec)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T)__vshasigmad((uint64x2_p)vec, func, subfunc);
@ -805,10 +855,14 @@ inline T VectorSHA512(const T& vec)
#endif
}
#endif // POWER8 crypto
#endif // _ARCH_PWR8
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
#endif // _ALTIVEC_
NAMESPACE_END
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic pop
#endif
#endif // CRYPTOPP_PPC_CRYPTO_H