From 225ab6cb7be252135d43b275fb0b2447036985ea Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 14 Nov 2018 08:19:13 -0500 Subject: [PATCH] Drop ChaCha requirements to POWER7 This costs about 0.6 cpb (700 MB/s on GCC112), but it makes the faster algorithm available to more machines. In the future we may want to provide both POWER7 and POWER8 --- chacha.cpp | 24 ++++++++++++------------ chacha_simd.cpp | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/chacha.cpp b/chacha.cpp index fe438507..333a58a6 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -24,8 +24,8 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif -#if (CRYPTOPP_POWER8_AVAILABLE) -extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds); +#if (CRYPTOPP_POWER7_AVAILABLE) +extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif #define CHACHA_QUARTER_ROUND(a,b,c,d) \ @@ -81,9 +81,9 @@ std::string ChaCha_Policy::AlgorithmProvider() const return "NEON"; else #endif -#if (CRYPTOPP_POWER8_AVAILABLE) - if (HasPower8()) - return "Power8"; +#if (CRYPTOPP_POWER7_AVAILABLE) + if (HasPower7()) + return "Power7"; else #endif return "C++"; @@ -139,8 +139,8 @@ unsigned int ChaCha_Policy::GetAlignment() const return 16; else #endif -#if (CRYPTOPP_POWER8_AVAILABLE) - if (HasPower8()) +#if (CRYPTOPP_POWER7_AVAILABLE) + if (HasPower7()) return 16; else #endif @@ -164,8 +164,8 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const return 4*BYTES_PER_ITERATION; else #endif -#if (CRYPTOPP_POWER8_AVAILABLE) - if (HasPower8()) +#if (CRYPTOPP_POWER7_AVAILABLE) + if (HasPower7()) return 4*BYTES_PER_ITERATION; else #endif @@ -245,13 +245,13 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, } #endif -#if (CRYPTOPP_POWER8_AVAILABLE) - if (HasPower8()) +#if (CRYPTOPP_POWER7_AVAILABLE) + if (HasPower7()) { while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; - ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); + ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds); // MultiBlockSafe avoids overflow on the counter words m_state[12] += 4; diff --git a/chacha_simd.cpp b/chacha_simd.cpp index 6e619281..5190f161 100644 --- a/chacha_simd.cpp +++ b/chacha_simd.cpp @@ -54,7 +54,7 @@ # include #endif -#if defined(CRYPTOPP_POWER8_AVAILABLE) +#if defined(CRYPTOPP_POWER7_AVAILABLE) # include "ppc_simd.h" #endif @@ -201,16 +201,25 @@ inline __m128i RotateLeft<16>(const __m128i val) #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE -// **************************** POWER8 **************************** // +// **************************** POWER7 **************************** // -#if (CRYPTOPP_POWER8_AVAILABLE) +#if (CRYPTOPP_POWER7_AVAILABLE) + +// POWER8 is optional and runs about 0.6 cpb faster because +// of the native 64-bit vector add. That's about 700 MB/s on +// GCC112 from the compile farm. Use -mcpu=power8 to engage +// POWER8. POWER7 lacks 64-bit element support, so code built +// with -mcpu=power8 will SIGILL on POWER7 machines. using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; -using CryptoPP::uint64x2_p; using CryptoPP::VectorLoad; using CryptoPP::VectorStore; +#if (_ARCH_PWR8 || _ARCH_PWR9) +using CryptoPP::uint64x2_p; +#endif + // Permutes bytes in packed 32-bit words to little endian. // State is already in proper endian order. Input and // output must be permuted during load and save. @@ -290,10 +299,22 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val) // Helper to perform 64-bit addition across two elements of 32-bit vectors inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b) { +#if (_ARCH_PWR8 || _ARCH_PWR9) return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b); +#else + // The carry mask selects carries from elements 1 and 3 and sets remaining + // elements to 0. The mask also shifts the carried values left by 4 bytes + // so the carries are added to elements 0 and 2. + const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16}; + const uint32x4_p zero = {0, 0, 0, 0}; + + uint32x4_p cy = vec_addc(a, b); + cy = vec_perm(cy, zero, cmask); + return vec_add(vec_add(a, b), cy); +#endif } -#endif // CRYPTOPP_POWER8_AVAILABLE +#endif // CRYPTOPP_POWER7_AVAILABLE ANONYMOUS_NAMESPACE_END @@ -835,9 +856,9 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte * #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE -#if (CRYPTOPP_POWER8_AVAILABLE) +#if (CRYPTOPP_POWER7_AVAILABLE) -void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds) +void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds) { const uint32x4_p state0 = VectorLoad(state + 0*4); const uint32x4_p state1 = VectorLoad(state + 1*4); @@ -1099,6 +1120,6 @@ void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte VectorStore32LE(output + 15*16, r3_3); } -#endif // CRYPTOPP_POWER8_AVAILABLE +#endif // CRYPTOPP_POWER7_AVAILABLE NAMESPACE_END