Drop ChaCha requirements to POWER7

This costs about 0.6 cpb (700 MB/s on GCC112), but it makes the faster algorithm available to more machines. In the future we may want to provide both POWER7 and POWER8
2018-11-14 08:19:13 -05:00 · 2018-11-14 08:19:13 -05:00 · 225ab6cb7b
parent b3941a433d
commit 225ab6cb7b
2 changed files with 41 additions and 20 deletions
--- a/chacha.cpp
+++ b/chacha.cpp
@ -24,8 +24,8 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
 extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
+extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #endif
 #define CHACHA_QUARTER_ROUND(a,b,c,d) \
@ -81,9 +81,9 @@ std::string ChaCha_Policy::AlgorithmProvider() const
        return "NEON";
    else
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-    if (HasPower8())
+    if (HasPower7())
-        return "Power8";
+        return "Power7";
    else
 #endif
    return "C++";
@ -139,8 +139,8 @@ unsigned int ChaCha_Policy::GetAlignment() const
        return 16;
    else
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-    if (HasPower8())
+    if (HasPower7())
        return 16;
    else
 #endif
@ -164,8 +164,8 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
        return 4*BYTES_PER_ITERATION;
    else
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-    if (HasPower8())
+    if (HasPower7())
        return 4*BYTES_PER_ITERATION;
    else
 #endif
@ -245,13 +245,13 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
        }
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-        if (HasPower8())
+        if (HasPower7())
        {
            while (iterationCount >= 4 && MultiBlockSafe(4))
            {
                const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
-                ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
+                ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
                // MultiBlockSafe avoids overflow on the counter words
                m_state[12] += 4;
--- a/chacha_simd.cpp
+++ b/chacha_simd.cpp
@ -54,7 +54,7 @@
 # include <arm_acle.h>
 #endif
-#if defined(CRYPTOPP_POWER8_AVAILABLE)
+#if defined(CRYPTOPP_POWER7_AVAILABLE)
 # include "ppc_simd.h"
 #endif
@ -201,16 +201,25 @@ inline __m128i RotateLeft<16>(const __m128i val)
 #endif  // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
-// **************************** POWER8 **************************** //
+// **************************** POWER7 **************************** //
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
 // POWER8 is optional and runs about 0.6 cpb faster because
 // of the native 64-bit vector add. That's about 700 MB/s on
 // GCC112 from the compile farm. Use -mcpu=power8 to engage
 // POWER8. POWER7 lacks 64-bit element support, so code built
 // with -mcpu=power8 will SIGILL on POWER7 machines.
 using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;
 using CryptoPP::uint64x2_p;
 using CryptoPP::VectorLoad;
 using CryptoPP::VectorStore;
 #if (_ARCH_PWR8 || _ARCH_PWR9)
 using CryptoPP::uint64x2_p;
 #endif
 // Permutes bytes in packed 32-bit words to little endian.
 // State is already in proper endian order. Input and
 // output must be permuted during load and save.
@ -290,10 +299,22 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
 // Helper to perform 64-bit addition across two elements of 32-bit vectors
 inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
 {
 #if (_ARCH_PWR8 || _ARCH_PWR9)
    return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
 #else
 	// The carry mask selects carries from elements 1 and 3 and sets remaining
 	// elements to 0. The mask also shifts the carried values left by 4 bytes
 	// so the carries are added to elements 0 and 2.
 	const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
 	const uint32x4_p zero = {0, 0, 0, 0};
 	uint32x4_p cy = vec_addc(a, b);
    cy = vec_perm(cy, zero, cmask);
    return vec_add(vec_add(a, b), cy);
 #endif
 }
-#endif  // CRYPTOPP_POWER8_AVAILABLE
+#endif  // CRYPTOPP_POWER7_AVAILABLE
 ANONYMOUS_NAMESPACE_END
@ -835,9 +856,9 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
 #endif  // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
+void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
 {
    const uint32x4_p state0 = VectorLoad(state + 0*4);
    const uint32x4_p state1 = VectorLoad(state + 1*4);
@ -1099,6 +1120,6 @@ void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte
    VectorStore32LE(output + 15*16, r3_3);
 }
-#endif  // CRYPTOPP_POWER8_AVAILABLE
+#endif  // CRYPTOPP_POWER7_AVAILABLE
 NAMESPACE_END